feat: refactor the library module; add text extraction and proofreading (confirm) states
@@ -9,6 +9,7 @@ import lombok.Getter;
import lombok.Setter;

import java.util.List;
import java.util.Set;

/**
 * Project: groot-data-bank
@@ -135,4 +136,37 @@ public class DatasetLibrary extends Library {
    @JsonProperty("word_count")
    @JSONField(name = "word_count")
    private Long wordCount;

    /**
     * Pages that have already been extracted
     */
    @JsonProperty("extraction_pages")
    @JSONField(name = "extraction_pages")
    private Set<Integer> extractionPages;

    /**
     * (Text) extraction state
     */
    @JsonProperty("extraction_state")
    @JSONField(name = "extraction_state")
    private String extractionState;

    /**
     * Pages that have already been proofread
     */
    @JsonProperty("confirm_pages")
    @JSONField(name = "confirm_pages")
    private Set<Integer> confirmPages;

    /**
     * (Text) proofreading state
     */
    @JsonProperty("confirm_state")
    @JSONField(name = "confirm_state")
    private String confirmState;

    /**
     * Source URL of the original article
     */
    private String articleSourceUrl;
}
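Both mappers are annotated, so the new fields keep their snake_case names whether a caller serializes with Jackson or fastjson. A minimal Jackson sketch using a hypothetical stand-in class (not part of this commit):

import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.util.Set;

public class SnakeCaseDemo {
    // Trimmed stand-in mirroring the annotations added to DatasetLibrary
    public static class Doc {
        @JsonProperty("extraction_pages")
        public Set<Integer> extractionPages;
        @JsonProperty("extraction_state")
        public String extractionState;
    }

    public static void main(String[] args) throws Exception {
        Doc doc = new Doc();
        doc.extractionPages = Set.of(1, 2, 3);
        doc.extractionState = "PARTIAL";
        // Prints snake_case keys, e.g. {"extraction_pages":[1,2,3],"extraction_state":"PARTIAL"}
        System.out.println(new ObjectMapper().writeValueAsString(doc));
    }
}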
@@ -1,6 +1,8 @@
package com.shuwen.groot.api.dto.library;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.annotation.JSONField;
import com.fasterxml.jackson.annotation.JsonProperty;
import lombok.Getter;
import lombok.Setter;
@@ -29,4 +31,18 @@ public class PageLibrary extends Library {
     * Complex-structure extraction result, containing text, tables, and images
     */
    private JSONArray segment;

    /**
     * (Text) extraction state
     */
    @JsonProperty("extraction_state")
    @JSONField(name = "extraction_state")
    private String extractionState;

    /**
     * (Text) proofreading state
     */
    @JsonProperty("confirm_state")
    @JSONField(name = "confirm_state")
    private String confirmState;
}
@@ -0,0 +1,27 @@
package com.shuwen.groot.api.enums;

/**
 * Library (text) proofreading state
 * Project: groot-data-bank
 * Description:
 * Author: liuliang
 * Create: 2025/3/7 17:40
 */
public enum LibraryConfirmState {
    /**
     * No proofreading needed
     */
    NO_NEED,
    /**
     * Not yet proofread
     */
    NOT,
    /**
     * Partially proofread
     */
    PARTIAL,
    /**
     * Fully proofread
     */
    ALL
}
@@ -0,0 +1,27 @@
package com.shuwen.groot.api.enums;

/**
 * Library (text) extraction state
 * Project: groot-data-bank
 * Description:
 * Author: liuliang
 * Create: 2025/3/7 17:40
 */
public enum LibraryExtractionState {
    /**
     * No extraction needed
     */
    NO_NEED,
    /**
     * Not yet extracted
     */
    NOT,
    /**
     * Partially extracted
     */
    PARTIAL,
    /**
     * Fully extracted
     */
    ALL
}
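The processors below persist these states as plain strings via name(); reading one back is a valueOf call. Illustrative only, assuming the stored value is a valid constant name:

String stored = LibraryExtractionState.PARTIAL.name();            // "PARTIAL"
LibraryExtractionState state = LibraryExtractionState.valueOf(stored);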
@@ -132,6 +132,12 @@ public class LibraryImportTask implements Serializable {
     * Word count
     */
    private Long wordCount;

    /**
     * Source URL of the original article
     */
    private String articleSourceUrl;

    /**
     * Task state
     */
@@ -79,6 +79,14 @@ public class CollectedDataManager {
        return ret.getJSONObject("data");
    }

    public JSONArray taskGetPage(JSONObject param) {
        String url = collectedUrl + "/api/graph/knowledge/dataset/getPage";
        String response = latencyHttpHandler.doPost(url, new JSONObject(), new JSONObject(), param.toJSONString());
        JSONObject ret = JSON.parseObject(response);
        checkExpression(ret.getBooleanValue("success"), InternalErrorCode.DATASET_ERROR, "Page result query failed: " + ret.getString("msg"));
        return ret.getJSONArray("data");
    }
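A hypothetical call site, mirroring how attachStatus (further down in this commit) consumes the result; datasetId is assumed to be in scope:

// Fetch per-page records for one dataset; each record carries a page index
// plus per-page extraction/confirm states.
JSONArray pages = collectedDataManager.taskGetPage(
        new JSONObject().fluentPut("datasetId", datasetId));
for (Object o : pages) {
    JSONObject page = (JSONObject) o;
    Integer sortIndex = page.getInteger("sort_index");
    String extractionState = page.getString("extraction_state");
}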

    public JSONArray taskSplitGet(JSONObject param) {
        String url = collectedUrl + "/api/graph/knowledge/dataset/split/get";
        String response = latencyHttpHandler.doPost(url, new JSONObject(), new JSONObject(), param.toJSONString());
@@ -44,6 +44,12 @@
            <groupId>com.shuwen.groot</groupId>
            <artifactId>groot-data-bank-manager</artifactId>
        </dependency>

        <dependency>
            <groupId>com.shuwen.data</groupId>
            <artifactId>groot-stream-api</artifactId>
            <version>0.0.1-SNAPSHOT</version>
        </dependency>
        <!-- project end -->

        <!-- tools start -->
@@ -8,14 +8,7 @@ import com.shuwen.groot.manager.constant.Constant;
import com.shuwen.search.proxy.api.entity.base.BoolQuery;
import com.shuwen.search.proxy.api.entity.base.FieldFilter;
import com.shuwen.search.proxy.api.entity.base.FieldNest;
import com.shuwen.search.proxy.api.entity.dto.common.CrudReqDto;
import com.shuwen.search.proxy.api.entity.dto.common.CrudRespDto;
import com.shuwen.search.proxy.api.entity.dto.common.DeleteByQueryReqDto;
import com.shuwen.search.proxy.api.entity.dto.common.FilterReqDto;
import com.shuwen.search.proxy.api.entity.dto.common.ItemsRespDto;
import com.shuwen.search.proxy.api.entity.dto.common.StatReqDto;
import com.shuwen.search.proxy.api.entity.dto.common.StatRespDto;
import com.shuwen.search.proxy.api.entity.dto.common.TextFullReqDto;
import com.shuwen.search.proxy.api.entity.dto.common.*;
import com.shuwen.search.proxy.api.entity.enums.FieldFilterTypeEnum;
import com.shuwen.search.proxy.api.service.ICrudService;
import com.shuwen.search.proxy.api.service.IFilterService;
@@ -138,6 +131,18 @@ public abstract class BaseSearchHandler {
        }
    }

    protected void upsert(CrudReqDto reqDto, int retryTimes) {
        try {
            CrudRespDto respDto = crudService().upsert(reqDto);
            if (!respDto.getSucceed()) {
                throw new DataBankException(InternalErrorCode.ELASTIC_ERROR, respDto.getMessage());
            }
        } catch (Exception e) {
            checkExpression(retryTimes < Constant.MAX_RETRY_TIME, InternalErrorCode.ELASTIC_ERROR, e);
            upsert(reqDto, ++retryTimes);
        }
    }
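The retry is recursive with a bounded depth: each failure increments retryTimes until checkExpression aborts with ELASTIC_ERROR once Constant.MAX_RETRY_TIME is reached. A first call therefore passes zero (illustrative):

// Retries transparently, up to Constant.MAX_RETRY_TIME attempts in total
upsert(reqDto, 0);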

    protected void delete(CrudReqDto reqDto, int retryTimes) {
        try {
            CrudRespDto respDto = crudService().delete(reqDto);
@@ -769,7 +769,7 @@ public class LibraryServiceImpl implements ILibraryService {
            }

            // Determine the extraction state
            result.put("extraction_state", getExtractionState(task.getPhotocopy(), extraction, task.getPageCount(), extractedPageSet.size()));
            //result.put("extraction_state", getExtractionState(task.getPhotocopy(), extraction, task.getPageCount(), extractedPageSet.size()));
        } else {
            extractedPageSet = Sets.newHashSet();
        }
@@ -916,7 +916,7 @@ public class LibraryServiceImpl implements ILibraryService {
            }

            // Determine the extraction state
            library.put("extraction_state", getExtractionState(task.getPhotocopy(), extraction, task.getPageCount(), extractedPageSet.size()));
            //library.put("extraction_state", getExtractionState(task.getPhotocopy(), extraction, task.getPageCount(), extractedPageSet.size()));
        } else {
            extractedPageSet = Sets.newHashSet();
        }
@@ -1231,7 +1231,7 @@ public class LibraryServiceImpl implements ILibraryService {
        }
    }

    private String getExtractionState(boolean photocopy, boolean extraction, Integer pageCount, int alreadyExtractedPageCount) {
    /*private String getExtractionState(boolean photocopy, boolean extraction, Integer pageCount, int alreadyExtractedPageCount) {
        // Determine the extraction state
        if (!photocopy && !extraction) {
            return "NO_NEED";
@@ -1248,9 +1248,8 @@ public class LibraryServiceImpl implements ILibraryService {
            return "PARTIAL";
        } else {
            return "ALL";

        }
    }
    }*/

    @SuppressWarnings("unchecked")
    private void postItemsExplainHandler(List<JSONObject> items, SearchLibraryRequest
@@ -6,20 +6,17 @@ import com.amazonaws.services.s3.model.S3Object;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import com.shuwen.groot.api.dto.library.DatasetEntity;
import com.shuwen.groot.api.dto.library.DatasetLibrary;
import com.shuwen.groot.api.dto.library.Library;
import com.shuwen.groot.api.dto.library.LibraryVector;
import com.shuwen.groot.api.dto.library.PageLibrary;
import com.shuwen.groot.api.dto.library.SectionLibrary;
import com.shuwen.groot.api.enums.LibraryFileType;
import com.shuwen.groot.api.enums.LibraryLevel;
import com.shuwen.groot.api.enums.LibraryVectorLevel;
import com.google.common.collect.Sets;
import com.shuwen.data.graph.stream.api.enums.DatasetConfirmEnum;
import com.shuwen.data.graph.stream.api.enums.DatasetExtractionEnum;
import com.shuwen.groot.api.dto.library.*;
import com.shuwen.groot.api.enums.*;
import com.shuwen.groot.common.enums.InternalErrorCode;
import com.shuwen.groot.common.exception.DataBankException;
import com.shuwen.groot.common.utils.IDUtils;
import com.shuwen.groot.common.utils.TimeUtils;
import com.shuwen.groot.dao.entity.LibraryImportTask;
import com.shuwen.groot.manager.collected.CollectedDataManager;
import com.shuwen.groot.manager.configmap.EmbeddingConfig;
import com.shuwen.groot.manager.ding.DingTalkNotifier;
import com.shuwen.groot.manager.ding.MarkDownMessage;
@@ -52,11 +49,7 @@ import java.io.FileOutputStream;
import java.io.InputStream;
import java.math.BigDecimal;
import java.nio.charset.StandardCharsets;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.UUID;
import java.util.*;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Executor;
import java.util.concurrent.atomic.AtomicBoolean;
@@ -103,6 +96,9 @@ public abstract class LibraryTaskProcessor {
    @Resource
    private Executor indexExecutor;

    @Resource
    private CollectedDataManager collectedDataManager;

    @Resource
    private DingTalkNotifier dingTalkNotifier;
@@ -440,6 +436,10 @@ public abstract class LibraryTaskProcessor {
        library.setPhotocopy(task.getPhotocopy());
        library.setSearchable(context.isSearchable());
        library.setWordCount(task.getWordCount());

        // Attach the text extraction and proofreading states
        attachStatus(task, context, library);

        library.setGraph(task.getGraph());
        library.setUserId(task.getUserId());
        library.setTenantId(task.getTenantId());
@@ -448,6 +448,87 @@ public abstract class LibraryTaskProcessor {
        return library;
    }

    private void attachStatus(LibraryImportTask task, LibraryIndexContext context, DatasetLibrary library) {
        // Only PDF and ZIP archives can be extracted
        boolean extraction = task.getFileType() == LibraryFileType.pdf || task.getFileType() == LibraryFileType.zip;

        // Fetch the already-extracted page indexes; only pdf and zip need extraction
        if (extraction) {
            JSONArray pageArray = collectedDataManager.taskGetPage(new JSONObject().fluentPut("datasetId", task.getDatasetId()));

            if (CollectionUtils.isNotEmpty(pageArray)) {
                Set<Integer> extractedPageSet = Sets.newHashSet();
                Set<Integer> confirmPageSet = Sets.newHashSet();
                for (Object t : pageArray) {
                    JSONObject datasetPageJson = (JSONObject) t;
                    if (DatasetExtractionEnum.extracted.name().equals(datasetPageJson.getString("extraction_state"))) {
                        extractedPageSet.add(datasetPageJson.getInteger("sort_index"));
                    }
                    if (DatasetConfirmEnum.confirmed.name().equals(datasetPageJson.getString("confirm_state"))) {
                        confirmPageSet.add(datasetPageJson.getInteger("sort_index"));
                    }
                }

                LibraryExtractionState libraryExtractionState = getExtractionState(task.getPhotocopy(), extraction, task.getPageCount(), extractedPageSet.size());
                library.setExtractionPages(extractedPageSet);
                library.setExtractionState(libraryExtractionState.name());

                LibraryConfirmState libraryConfirmState = getConfirmState(task.getPhotocopy(), extraction, task.getPageCount(), confirmPageSet.size());
                library.setConfirmPages(confirmPageSet);
                library.setConfirmState(libraryConfirmState.name());
            }
        } else {
            library.setExtractionState(LibraryExtractionState.NO_NEED.name());
            library.setConfirmState(LibraryConfirmState.NO_NEED.name());
        }
    }

    private LibraryExtractionState getExtractionState(boolean photocopy, boolean extraction, Integer pageCount, int alreadyExtractedPageCount) {
        // Determine the extraction state
        if (!photocopy && !extraction) {
            return LibraryExtractionState.NO_NEED;
        }
        if (!photocopy) {
            return LibraryExtractionState.ALL;
        }
        if (pageCount == null || pageCount <= 0) {
            return LibraryExtractionState.NO_NEED;
        }
        if (alreadyExtractedPageCount == 0) {
            return LibraryExtractionState.NOT;
        } else if (alreadyExtractedPageCount < pageCount) {
            return LibraryExtractionState.PARTIAL;
        } else {
            return LibraryExtractionState.ALL;
        }
    }

    private LibraryConfirmState getConfirmState(boolean photocopy, boolean extraction, Integer pageCount, int alreadyConfirmPageCount) {
        // Determine the proofreading state
        if (!photocopy && !extraction) {
            return LibraryConfirmState.NO_NEED;
        }
        if (!photocopy) {
            return LibraryConfirmState.ALL;
        }
        if (pageCount == null || pageCount <= 0) {
            return LibraryConfirmState.NO_NEED;
        }
        if (alreadyConfirmPageCount == 0) {
            return LibraryConfirmState.NOT;
        } else if (alreadyConfirmPageCount < pageCount) {
            return LibraryConfirmState.PARTIAL;
        } else {
            return LibraryConfirmState.ALL;
        }
    }
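getExtractionState and getConfirmState implement the same rule over different enums; the shared logic, pulled out as a sketch for illustration (not part of the commit):

// Shared mapping rule, illustrated with plain strings matching the enum names:
// born-digital sources (photocopy=false) are complete by definition, photocopy
// sources are tracked per page, and a missing page count means nothing to track.
static String stateOf(boolean photocopy, boolean extraction, Integer pageCount, int done) {
    if (!photocopy && !extraction) return "NO_NEED";
    if (!photocopy) return "ALL";
    if (pageCount == null || pageCount <= 0) return "NO_NEED";
    if (done == 0) return "NOT";
    return done < pageCount ? "PARTIAL" : "ALL";
}
// Worked cases: pageCount=10 with done=0 -> NOT, done=4 -> PARTIAL, done=10 -> ALL.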

    private JSONObject normAddress(String address, String graph) {
        JSONObject property = new JSONObject()
                .fluentPut("struct_address", address);
@@ -241,6 +241,9 @@ public class ParsedLibraryTaskProcessor extends LibraryTaskProcessor {
        }

        pageLibrary.setSegment(structDetailJson.getJSONArray("segment"));
        pageLibrary.setExtractionState(structDetailJson.getString("extraction_state"));
        pageLibrary.setConfirmState(structDetailJson.getString("confirm_state"));

        pageLibrary.setGraph(task.getGraph());
        pageLibrary.setUserId(task.getUserId());
        pageLibrary.setTenantId(task.getTenantId());
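For reference, a sketch of the structDetailJson shape consumed above; the field names come from this commit, while the segment payload and the state values ("extracted", "confirmed", taken from the Dataset*Enum names used earlier) are assumptions:

JSONObject structDetailJson = new JSONObject()
        .fluentPut("segment", new JSONArray())          // text/table/image segments
        .fluentPut("extraction_state", "extracted")
        .fluentPut("confirm_state", "confirmed");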