feat:资料库重构,增加文本抽取及校对状态等

This commit is contained in:
刘亮
2025-03-11 14:13:13 +08:00
parent 55a8559f9a
commit d8102c88f7
11 changed files with 239 additions and 27 deletions

View File

@@ -9,6 +9,7 @@ import lombok.Getter;
import lombok.Setter;
import java.util.List;
import java.util.Set;
/**
* Project: groot-data-bank
@@ -135,4 +136,37 @@ public class DatasetLibrary extends Library {
@JsonProperty("word_count")
@JSONField(name = "word_count")
private Long wordCount;
/**
 * Pages that have already been text-extracted (set of page sort indexes).
 */
@JsonProperty("extraction_pages")
@JSONField(name = "extraction_pages")
private Set<Integer> extractionPages;
/**
 * (Text) extraction state; holds a LibraryExtractionState enum name.
 */
@JsonProperty("extraction_state")
@JSONField(name = "extraction_state")
private String extractionState;
/**
 * Pages that have already been proofread/confirmed (set of page sort indexes).
 */
@JsonProperty("confirm_pages")
@JSONField(name = "confirm_pages")
private Set<Integer> confirmPages;
/**
 * (Text) proofreading/confirmation state; holds a LibraryConfirmState enum name.
 */
@JsonProperty("confirm_state")
@JSONField(name = "confirm_state")
private String confirmState;
/**
 * Source URL of the original article/document.
 */
private String articleSourceUrl;
}

View File

@@ -1,6 +1,8 @@
package com.shuwen.groot.api.dto.library;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.annotation.JSONField;
import com.fasterxml.jackson.annotation.JsonProperty;
import lombok.Getter;
import lombok.Setter;
@@ -29,4 +31,18 @@ public class PageLibrary extends Library {
* 复杂结构抽取结果,含文字、表格、图片
*/
private JSONArray segment;
/**
 * (Text) extraction state of this page.
 */
@JsonProperty("extraction_state")
@JSONField(name = "extraction_state")
private String extractionState;
/**
 * (Text) proofreading/confirmation state of this page.
 */
@JsonProperty("confirm_state")
@JSONField(name = "confirm_state")
private String confirmState;
}

View File

@@ -0,0 +1,27 @@
package com.shuwen.groot.api.enums;
/**
* 资料库(文本)校对状态
* Project: groot-data-bank
* Description:
* Author: liuliang
* Create: 2025/3/7 17:40
*/
public enum LibraryConfirmState {
/**
 * No proofreading required.
 */
NO_NEED,
/**
 * Not proofread yet.
 */
NOT,
/**
 * Partially proofread.
 */
PARTIAL,
/**
 * Fully proofread (all pages).
 */
ALL
}

View File

@@ -0,0 +1,27 @@
package com.shuwen.groot.api.enums;
/**
* 资料库(文本)抽取状态
* Project: groot-data-bank
* Description:
* Author: liuliang
* Create: 2025/3/7 17:40
*/
public enum LibraryExtractionState {
/**
 * No extraction required.
 */
NO_NEED,
/**
 * Not extracted yet.
 */
NOT,
/**
 * Partially extracted.
 */
PARTIAL,
/**
 * Fully extracted (all pages).
 */
ALL
}

View File

@@ -132,6 +132,12 @@ public class LibraryImportTask implements Serializable {
* 字数
*/
private Long wordCount;
/**
* 原文来源地址
*/
private String articleSourceUrl;
/**
* 任务状态
*/

View File

@@ -79,6 +79,14 @@ public class CollectedDataManager {
return ret.getJSONObject("data");
}
/**
 * Queries per-page dataset records from the collected-data service.
 *
 * @param param request body; callers pass e.g. {"datasetId": ...}
 * @return the "data" array of the service response
 */
public JSONArray taskGetPage(JSONObject param) {
    String endpoint = collectedUrl + "/api/graph/knowledge/dataset/getPage";
    JSONObject result = JSON.parseObject(
            latencyHttpHandler.doPost(endpoint, new JSONObject(), new JSONObject(), param.toJSONString()));
    // Fail fast with the remote error message when the call did not succeed.
    checkExpression(result.getBooleanValue("success"), InternalErrorCode.DATASET_ERROR, "页面结果查询失败: " + result.getString("msg"));
    return result.getJSONArray("data");
}
public JSONArray taskSplitGet(JSONObject param) {
String url = collectedUrl + "/api/graph/knowledge/dataset/split/get";
String response = latencyHttpHandler.doPost(url, new JSONObject(), new JSONObject(), param.toJSONString());

View File

@@ -44,6 +44,12 @@
<groupId>com.shuwen.groot</groupId>
<artifactId>groot-data-bank-manager</artifactId>
</dependency>
<dependency>
<groupId>com.shuwen.data</groupId>
<artifactId>groot-stream-api</artifactId>
<version>0.0.1-SNAPSHOT</version>
</dependency>
<!-- project end -->
<!-- tools start -->

View File

@@ -8,14 +8,7 @@ import com.shuwen.groot.manager.constant.Constant;
import com.shuwen.search.proxy.api.entity.base.BoolQuery;
import com.shuwen.search.proxy.api.entity.base.FieldFilter;
import com.shuwen.search.proxy.api.entity.base.FieldNest;
import com.shuwen.search.proxy.api.entity.dto.common.CrudReqDto;
import com.shuwen.search.proxy.api.entity.dto.common.CrudRespDto;
import com.shuwen.search.proxy.api.entity.dto.common.DeleteByQueryReqDto;
import com.shuwen.search.proxy.api.entity.dto.common.FilterReqDto;
import com.shuwen.search.proxy.api.entity.dto.common.ItemsRespDto;
import com.shuwen.search.proxy.api.entity.dto.common.StatReqDto;
import com.shuwen.search.proxy.api.entity.dto.common.StatRespDto;
import com.shuwen.search.proxy.api.entity.dto.common.TextFullReqDto;
import com.shuwen.search.proxy.api.entity.dto.common.*;
import com.shuwen.search.proxy.api.entity.enums.FieldFilterTypeEnum;
import com.shuwen.search.proxy.api.service.ICrudService;
import com.shuwen.search.proxy.api.service.IFilterService;
@@ -138,6 +131,18 @@ public abstract class BaseSearchHandler {
}
}
/**
 * Upserts a document through the CRUD service, retrying on any failure
 * (exception or non-succeeded response) until the retry budget
 * {@code Constant.MAX_RETRY_TIME} is exhausted, then surfaces ELASTIC_ERROR.
 *
 * @param reqDto     the upsert request
 * @param retryTimes number of attempts already made
 */
protected void upsert(CrudReqDto reqDto, int retryTimes) {
    try {
        CrudRespDto response = crudService().upsert(reqDto);
        if (!response.getSucceed()) {
            // Treat an unsuccessful response like a transient failure so it is retried below.
            throw new DataBankException(InternalErrorCode.ELASTIC_ERROR, response.getMessage());
        }
    } catch (Exception e) {
        // Give up once the budget is spent; otherwise recurse with the counter advanced.
        checkExpression(retryTimes < Constant.MAX_RETRY_TIME, InternalErrorCode.ELASTIC_ERROR, e);
        upsert(reqDto, ++retryTimes);
    }
}
protected void delete(CrudReqDto reqDto, int retryTimes) {
try {
CrudRespDto respDto = crudService().delete(reqDto);

View File

@@ -769,7 +769,7 @@ public class LibraryServiceImpl implements ILibraryService {
}
// 获取抽取状态
result.put("extraction_state", getExtractionState(task.getPhotocopy(), extraction, task.getPageCount(), extractedPageSet.size()));
//result.put("extraction_state", getExtractionState(task.getPhotocopy(), extraction, task.getPageCount(), extractedPageSet.size()));
} else {
extractedPageSet = Sets.newHashSet();
}
@@ -916,7 +916,7 @@ public class LibraryServiceImpl implements ILibraryService {
}
// 获取抽取状态
library.put("extraction_state", getExtractionState(task.getPhotocopy(), extraction, task.getPageCount(), extractedPageSet.size()));
//library.put("extraction_state", getExtractionState(task.getPhotocopy(), extraction, task.getPageCount(), extractedPageSet.size()));
} else {
extractedPageSet = Sets.newHashSet();
}
@@ -1231,7 +1231,7 @@ public class LibraryServiceImpl implements ILibraryService {
}
}
private String getExtractionState(boolean photocopy, boolean extraction, Integer pageCount, int alreadyExtractedPageCount) {
/*private String getExtractionState(boolean photocopy, boolean extraction, Integer pageCount, int alreadyExtractedPageCount) {
// 获取抽取状态
if (!photocopy && !extraction) {
return "NO_NEED";
@@ -1248,9 +1248,8 @@ public class LibraryServiceImpl implements ILibraryService {
return "PARTIAL";
} else {
return "ALL";
}
}
}*/
@SuppressWarnings("unchecked")
private void postItemsExplainHandler(List<JSONObject> items, SearchLibraryRequest

View File

@@ -6,20 +6,17 @@ import com.amazonaws.services.s3.model.S3Object;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import com.shuwen.groot.api.dto.library.DatasetEntity;
import com.shuwen.groot.api.dto.library.DatasetLibrary;
import com.shuwen.groot.api.dto.library.Library;
import com.shuwen.groot.api.dto.library.LibraryVector;
import com.shuwen.groot.api.dto.library.PageLibrary;
import com.shuwen.groot.api.dto.library.SectionLibrary;
import com.shuwen.groot.api.enums.LibraryFileType;
import com.shuwen.groot.api.enums.LibraryLevel;
import com.shuwen.groot.api.enums.LibraryVectorLevel;
import com.google.common.collect.Sets;
import com.shuwen.data.graph.stream.api.enums.DatasetConfirmEnum;
import com.shuwen.data.graph.stream.api.enums.DatasetExtractionEnum;
import com.shuwen.groot.api.dto.library.*;
import com.shuwen.groot.api.enums.*;
import com.shuwen.groot.common.enums.InternalErrorCode;
import com.shuwen.groot.common.exception.DataBankException;
import com.shuwen.groot.common.utils.IDUtils;
import com.shuwen.groot.common.utils.TimeUtils;
import com.shuwen.groot.dao.entity.LibraryImportTask;
import com.shuwen.groot.manager.collected.CollectedDataManager;
import com.shuwen.groot.manager.configmap.EmbeddingConfig;
import com.shuwen.groot.manager.ding.DingTalkNotifier;
import com.shuwen.groot.manager.ding.MarkDownMessage;
@@ -52,11 +49,7 @@ import java.io.FileOutputStream;
import java.io.InputStream;
import java.math.BigDecimal;
import java.nio.charset.StandardCharsets;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.UUID;
import java.util.*;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Executor;
import java.util.concurrent.atomic.AtomicBoolean;
@@ -103,6 +96,9 @@ public abstract class LibraryTaskProcessor {
@Resource
private Executor indexExecutor;
@Resource
private CollectedDataManager collectedDataManager;
@Resource
private DingTalkNotifier dingTalkNotifier;
@@ -440,6 +436,10 @@ public abstract class LibraryTaskProcessor {
library.setPhotocopy(task.getPhotocopy());
library.setSearchable(context.isSearchable());
library.setWordCount(task.getWordCount());
//增加对文本抽取状态及校对状态的处理
attachStatus(task,context,library);
library.setGraph(task.getGraph());
library.setUserId(task.getUserId());
library.setTenantId(task.getTenantId());
@@ -448,6 +448,87 @@ public abstract class LibraryTaskProcessor {
return library;
}
/**
 * Attaches text-extraction and proofreading (confirm) status to the library.
 * Only PDF and ZIP archives go through page-level extraction; every other
 * file type is marked NO_NEED for both states.
 *
 * <p>Fix: the original issued the identical {@code taskGetPage} remote call
 * twice in a row; it is now fetched once and reused.
 *
 * @param task    the import task carrying file type, page count and dataset id
 * @param context indexing context (currently unused here, kept for signature stability)
 * @param library the dataset library entity to populate
 */
private void attachStatus(LibraryImportTask task, LibraryIndexContext context, DatasetLibrary library) {
    // Only PDF and ZIP file packages can be extracted.
    boolean extraction = task.getFileType() == LibraryFileType.pdf || task.getFileType() == LibraryFileType.zip;
    if (!extraction) {
        library.setExtractionState(LibraryExtractionState.NO_NEED.name());
        library.setConfirmState(LibraryConfirmState.NO_NEED.name());
        return;
    }
    // Single remote fetch of the per-page records (previously called twice with the same params).
    JSONArray datasetPageArray = collectedDataManager.taskGetPage(new JSONObject().fluentPut("datasetId", task.getDatasetId()));
    if (!CollectionUtils.isNotEmpty(datasetPageArray)) {
        // No page data yet: leave both states unset, matching the original behavior.
        return;
    }
    Set<Integer> extractedPageSet = Sets.newHashSet();
    Set<Integer> confirmPageSet = Sets.newHashSet();
    for (Object t : datasetPageArray) {
        JSONObject datasetPageJson = (JSONObject) t;
        if (DatasetExtractionEnum.extracted.name().equals(datasetPageJson.getString("extraction_state"))) {
            extractedPageSet.add(datasetPageJson.getInteger("sort_index"));
        }
        if (DatasetConfirmEnum.confirmed.name().equals(datasetPageJson.getString("confirm_state"))) {
            confirmPageSet.add(datasetPageJson.getInteger("sort_index"));
        }
    }
    LibraryExtractionState libraryExtractionState = getExtractionState(task.getPhotocopy(), extraction, task.getPageCount(), extractedPageSet.size());
    library.setExtractionPages(extractedPageSet);
    library.setExtractionState(libraryExtractionState.name());
    LibraryConfirmState libraryConfirmState = getConfirmState(task.getPhotocopy(), extraction, task.getPageCount(), confirmPageSet.size());
    library.setConfirmPages(confirmPageSet);
    library.setConfirmState(libraryConfirmState.name());
}
/**
 * Derives the overall extraction state of a library.
 *
 * @param photocopy                 whether the document is a scanned photocopy
 * @param extraction                whether the file type supports extraction at all
 * @param pageCount                 total page count (may be null)
 * @param alreadyExtractedPageCount number of pages already extracted
 * @return the aggregated extraction state
 */
private LibraryExtractionState getExtractionState(boolean photocopy, boolean extraction, Integer pageCount, int alreadyExtractedPageCount) {
    // Non-photocopy documents: either extraction applies wholesale or not at all.
    if (!photocopy) {
        return extraction ? LibraryExtractionState.ALL : LibraryExtractionState.NO_NEED;
    }
    // Without a positive page count there is nothing to measure progress against.
    if (pageCount == null || pageCount <= 0) {
        return LibraryExtractionState.NO_NEED;
    }
    if (alreadyExtractedPageCount == 0) {
        return LibraryExtractionState.NOT;
    }
    return alreadyExtractedPageCount < pageCount ? LibraryExtractionState.PARTIAL : LibraryExtractionState.ALL;
}
/**
 * Derives the overall proofreading (confirm) state of a library; mirrors
 * {@code getExtractionState}. (The original comment said "extraction state" —
 * a copy-paste slip; this method computes the confirm state.)
 *
 * @param photocopy               whether the document is a scanned photocopy
 * @param extraction              whether the file type supports extraction at all
 * @param pageCount               total page count (may be null)
 * @param alreadyConfirmPageCount number of pages already proofread
 * @return the aggregated confirm state
 */
private LibraryConfirmState getConfirmState(boolean photocopy, boolean extraction, Integer pageCount, int alreadyConfirmPageCount) {
    // Non-photocopy documents: confirmed wholesale when extraction applies, otherwise not needed.
    if (!photocopy) {
        return extraction ? LibraryConfirmState.ALL : LibraryConfirmState.NO_NEED;
    }
    // Without a positive page count there is nothing to measure progress against.
    if (pageCount == null || pageCount <= 0) {
        return LibraryConfirmState.NO_NEED;
    }
    if (alreadyConfirmPageCount == 0) {
        return LibraryConfirmState.NOT;
    }
    return alreadyConfirmPageCount < pageCount ? LibraryConfirmState.PARTIAL : LibraryConfirmState.ALL;
}
private JSONObject normAddress(String address, String graph) {
JSONObject property = new JSONObject()
.fluentPut("struct_address", address);

View File

@@ -241,6 +241,9 @@ public class ParsedLibraryTaskProcessor extends LibraryTaskProcessor {
}
pageLibrary.setSegment(structDetailJson.getJSONArray("segment"));
pageLibrary.setExtractionState(structDetailJson.getString("extraction_state"));
pageLibrary.setConfirmState(structDetailJson.getString("confirm_state"));
pageLibrary.setGraph(task.getGraph());
pageLibrary.setUserId(task.getUserId());
pageLibrary.setTenantId(task.getTenantId());