重要的是:思路和方法

步骤一:表结构设计
知识库表:用于管理文档
-- Knowledge base table: one row per knowledge base.
-- The table uses utf8mb4 instead of the deprecated utf8mb3 so that 4-byte UTF-8
-- characters (emoji, rare CJK ideographs) can be stored in the text columns.
-- The collation is pinned to utf8mb4_general_ci to mirror the previous
-- utf8mb3_general_ci; string columns inherit both settings from the table.
-- NOTE(review): embed_id is varchar(32) while airag_model.id is bigint — confirm
-- that the model id is intentionally stored as text.
CREATE TABLE `airag_knowledge` (
  `id` bigint NOT NULL AUTO_INCREMENT,
  `name` varchar(100) DEFAULT NULL COMMENT '知识库名称',
  `descr` varchar(500) DEFAULT NULL COMMENT '描述',
  `embed_id` varchar(32) DEFAULT NULL COMMENT '向量模型id',
  `status` varchar(32) DEFAULT NULL COMMENT '状态',
  `tenant_id` varchar(20) DEFAULT NULL COMMENT '租户id',
  `create_dept` bigint DEFAULT NULL COMMENT '创建部门',
  `create_by` bigint DEFAULT NULL COMMENT '创建者',
  `create_time` datetime DEFAULT NULL COMMENT '创建时间',
  `update_by` bigint DEFAULT NULL COMMENT '更新者',
  `update_time` datetime DEFAULT NULL COMMENT '更新时间',
  `del_flag` char(1) DEFAULT '0' COMMENT '删除标志(0代表存在 2代表删除)',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci ROW_FORMAT=DYNAMIC COMMENT='知识库表';
文档表:用于存储文件信息
-- Knowledge base document table: one row per document belonging to a knowledge base.
-- The table uses utf8mb4 instead of the deprecated utf8mb3 so that document
-- titles and content containing 4-byte UTF-8 characters (emoji, rare CJK
-- ideographs) can be stored. The collation is pinned to utf8mb4_general_ci to
-- mirror the previous utf8mb3_general_ci; string columns inherit both settings.
-- idx_knowledge_id supports looking up documents by their knowledge base, which
-- is how documents are listed and rebuilt.
CREATE TABLE `airag_knowledge_doc` (
  `id` bigint NOT NULL AUTO_INCREMENT,
  `knowledge_id` bigint DEFAULT NULL COMMENT '知识库id',
  `title` varchar(100) DEFAULT NULL COMMENT '标题',
  `type` varchar(32) DEFAULT NULL COMMENT '类型',
  `content` text COMMENT '内容',
  `status` varchar(32) DEFAULT NULL COMMENT '状态',
  `metadata` text COMMENT '元数据',
  `tenant_id` varchar(20) DEFAULT NULL COMMENT '租户id',
  `create_dept` bigint DEFAULT NULL COMMENT '创建部门',
  `create_by` bigint DEFAULT NULL COMMENT '创建者',
  `create_time` datetime DEFAULT NULL COMMENT '创建时间',
  `update_by` bigint DEFAULT NULL COMMENT '更新者',
  `update_time` datetime DEFAULT NULL COMMENT '更新时间',
  `del_flag` char(1) DEFAULT '0' COMMENT '删除标志(0代表存在 2代表删除)',
  PRIMARY KEY (`id`) USING BTREE,
  KEY `idx_knowledge_id` (`knowledge_id`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci ROW_FORMAT=DYNAMIC COMMENT='知识库文档表';
模型配置表:用于切换模型、构建LLM、Embed模型
-- Model configuration table: one row per configured model (provider, model
-- name, credential, API base URL, model type and parameters).
-- The table uses utf8mb4 instead of the deprecated utf8mb3 so that 4-byte UTF-8
-- characters can be stored in the text columns. The collation is pinned to
-- utf8mb4_general_ci to mirror the previous utf8mb3_general_ci; string columns
-- inherit both settings from the table.
-- NOTE(review): credential appears to hold secrets in a plain varchar column —
-- confirm whether the application encrypts the value before storing it.
CREATE TABLE `airag_model` (
  `id` bigint NOT NULL AUTO_INCREMENT,
  `name` varchar(100) DEFAULT NULL COMMENT '名称',
  `provider` varchar(50) DEFAULT NULL COMMENT '供应者',
  `model_name` varchar(100) DEFAULT NULL COMMENT '模型名称',
  `credential` varchar(500) DEFAULT NULL COMMENT '凭证信息',
  `base_url` varchar(500) DEFAULT NULL COMMENT 'API域名',
  `model_type` varchar(32) DEFAULT NULL COMMENT '模型类型',
  `model_params` varchar(500) DEFAULT NULL COMMENT '模型参数',
  `tenant_id` varchar(20) DEFAULT NULL COMMENT '租户id',
  `create_dept` bigint DEFAULT NULL COMMENT '创建部门',
  `create_by` bigint DEFAULT NULL COMMENT '创建者',
  `create_time` datetime DEFAULT NULL COMMENT '创建时间',
  `update_by` bigint DEFAULT NULL COMMENT '更新者',
  `update_time` datetime DEFAULT NULL COMMENT '更新时间',
  `del_flag` char(1) DEFAULT '0' COMMENT '删除标志(0代表存在 2代表删除)',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci ROW_FORMAT=DYNAMIC COMMENT='模型管理表';
向量库表:存储文档的向量数据
-- PostgreSQL vector store table: one row per embedded text segment.
-- The "vector" column type is resolved from the public schema.
CREATE TABLE public.embeddings (
    embedding_id uuid NOT NULL,                  -- segment identifier (primary key)
    embedding    public.vector,                  -- embedding vector of the segment
    "text"       text COLLATE pg_catalog."default", -- original text of the segment
    metadata     json,                           -- metadata attached to the segment
    CONSTRAINT vector_1_pkey PRIMARY KEY (embedding_id)
);
步骤二:知识库功能开发
接口1:分页查询知识库
Controller层
/**
 * Returns one page of knowledge bases.
 *
 * @param bo        query conditions bound from the request
 * @param pageQuery paging parameters
 * @return the page returned by the knowledge-base service
 */
@GetMapping("/list")
public TableDataInfo<AiragKnowledgeVo> list(AiragKnowledgeBo bo, PageQuery pageQuery) {
    TableDataInfo<AiragKnowledgeVo> page = airagKnowledgeService.queryPageList(bo, pageQuery);
    return page;
}
Service层
/**
 * Queries knowledge bases page by page.
 *
 * @param bo        query conditions, turned into a wrapper by buildQueryWrapper
 * @param pageQuery paging parameters
 * @return the matching page wrapped as table data
 */
@Override
public TableDataInfo<AiragKnowledgeVo> queryPageList(AiragKnowledgeBo bo, PageQuery pageQuery) {
    LambdaQueryWrapper<AiragKnowledge> wrapper = buildQueryWrapper(bo);
    Page<AiragKnowledgeVo> page = airagKnowledgeMapper.selectVoPage(pageQuery.build(), wrapper);
    return TableDataInfo.build(page);
}
接口2:知识库详情
Controller层
/**
 * Returns the details of a single knowledge base.
 *
 * @param id primary key taken from the request path
 * @return the knowledge base returned by the service, wrapped in a success response
 */
@GetMapping("/{id}")
public R<AiragKnowledgeVo> getInfo(
        @NotNull(message = "主键不能为空") @PathVariable String id) {
    AiragKnowledgeVo detail = airagKnowledgeService.queryById(id);
    return R.ok(detail);
}
Service层
/**
 * Loads a knowledge base by primary key.
 *
 * @param id primary key of the knowledge base
 * @return the view object returned by the mapper for that id
 */
@Override
public AiragKnowledgeVo queryById(String id) {
    AiragKnowledgeVo detail = airagKnowledgeMapper.selectVoById(id);
    return detail;
}
接口3:新增知识库
Controller层
/**
 * Creates a knowledge base.
 *
 * @param bo request body, validated against the AddGroup constraints
 * @return the response built by toAjax from the service result
 */
@PostMapping("/add")
public R<Void> add(@Validated(AddGroup.class) @RequestBody AiragKnowledgeBo bo) {
    Boolean inserted = airagKnowledgeService.insertByBo(bo);
    return toAjax(inserted);
}
Service层
/**
 * Inserts a new knowledge base with its status set to enabled.
 *
 * @param bo data of the knowledge base; receives the generated id on success
 * @return {@code true} when the insert affected at least one row
 */
@Override
public Boolean insertByBo(AiragKnowledgeBo bo) {
    AiragKnowledge entity = MapstructUtils.convert(bo, AiragKnowledge.class);
    entity.setStatus(LLMConsts.STATUS_ENABLE);
    boolean inserted = airagKnowledgeMapper.insert(entity) > 0;
    if (!inserted) {
        return false;
    }
    // Hand the id of the inserted entity back to the caller through the BO.
    bo.setId(entity.getId());
    return true;
}
接口4:修改知识库
Controller层
/**
 * Updates a knowledge base inside a transaction that rolls back on any exception.
 *
 * @param bo request body, validated against the EditGroup constraints
 * @return the response built by toAjax from the service result
 */
@Transactional(rollbackFor = Exception.class)
@PutMapping("/edit")
public R<Void> edit(@Validated(EditGroup.class) @RequestBody AiragKnowledgeBo bo) {
    Boolean updated = airagKnowledgeService.updateByBo(bo);
    return toAjax(updated);
}
Service层
/**
 * Updates a knowledge base and rebuilds its documents when the embedding model changed.
 *
 * @param bo edited knowledge-base data; its id selects the row to update
 * @return {@code true} when the row was updated, {@code false} when the update affected no row
 * @throws ServiceException when no knowledge base exists for the given id
 */
@Override
public Boolean updateByBo(AiragKnowledgeBo bo) {
    AiragKnowledge update = MapstructUtils.convert(bo, AiragKnowledge.class);
    AiragKnowledge airagKnowledgeEntity = airagKnowledgeMapper.selectById(update.getId());
    if (ObjectUtil.isEmpty(airagKnowledgeEntity)) {
        throw new ServiceException("未找到对应数据");
    }
    // Remember the previous embedding model before the row is overwritten.
    String oldEmbedId = airagKnowledgeEntity.getEmbedId();
    int updated = airagKnowledgeMapper.updateById(update);
    // updateById reports the number of affected rows, so "nothing updated" is 0, not a negative value.
    if (updated <= 0) {
        return false;
    }
    // Null-safe comparison: the stored embed_id may be NULL (the column has no NOT NULL constraint).
    // NOTE(review): a null embedId in the request is still treated as a change when a model
    // was stored before — confirm whether partial updates should skip the rebuild instead.
    String newEmbedId = update.getEmbedId();
    boolean embedModelChanged = (oldEmbedId == null)
        ? newEmbedId != null
        : !oldEmbedId.equalsIgnoreCase(newEmbedId);
    if (embedModelChanged) {
        // The embedding model changed, so the documents of this knowledge base are rebuilt.
        iairagKnowledgeDocService.rebuildDocumentByKnowId(String.valueOf(update.getId()));
    }
    return true;
}
接口5:批量删除知识库
Controller层
/**
 * Deletes one or more knowledge bases.
 *
 * @param ids primary keys taken from the request path
 * @return the response built by toAjax from the service result
 */
@DeleteMapping("/{ids}")
public R<Void> remove(
        @NotEmpty(message = "主键不能为空") @PathVariable String[] ids) {
    List<String> idList = List.of(ids);
    return toAjax(airagKnowledgeService.deleteWithValidByIds(idList, true));
}
Service层
/**
 * Deletes knowledge bases together with the documents that belong to them.
 *
 * @param ids     primary keys of the knowledge bases to delete
 * @param isValid accepted for interface compatibility; not read by this method
 * @return always {@code true}
 */
@Override
public Boolean deleteWithValidByIds(List<String> ids, Boolean isValid) {
    // Remove the documents of these knowledge bases first, then the knowledge bases themselves.
    iairagKnowledgeDocService.removeByKnowIds(ids);
    airagKnowledgeMapper.deleteByIds(ids);
    return Boolean.TRUE;
}
注:删除知识库时,需要先删除该知识库下的所有文档,并同步删除这些文档对应的向量数据!
步骤三:文档功能开发
接口1:文档分页查询
Controller层
/**
 * Returns one page of documents.
 *
 * @param bo        query conditions bound from the request
 * @param pageQuery paging parameters
 * @return the page returned by the document service
 */
@GetMapping("/doc/list")
public TableDataInfo<AiragKnowledgeDocVo> list(AiragKnowledgeDocBo bo, PageQuery pageQuery) {
    TableDataInfo<AiragKnowledgeDocVo> page = airagKnowledgeDocService.queryPageList(bo, pageQuery);
    return page;
}
Service层
/**
 * Queries the documents of a knowledge base page by page.
 *
 * @param bo        query conditions; the knowledge base id is mandatory
 * @param pageQuery paging parameters
 * @return the matching page wrapped as table data
 * @throws ServiceException when no knowledge base id is supplied
 */
@Override
public TableDataInfo<AiragKnowledgeDocVo> queryPageList(AiragKnowledgeDocBo bo, PageQuery pageQuery) {
    boolean knowledgeMissing = ObjectUtil.isEmpty(bo.getKnowledgeId());
    if (knowledgeMissing) {
        throw new ServiceException("请先选择知识库");
    }
    LambdaQueryWrapper<AiragKnowledgeDoc> wrapper = buildQueryWrapper(bo);
    Page<AiragKnowledgeDocVo> page = airagKnowledgeDocMapper.selectVoPage(pageQuery.build(), wrapper);
    return TableDataInfo.build(page);
}
接口2:文档详情
Controller层
/**
 * Returns the details of a single document.
 *
 * @param id primary key taken from the request path
 * @return the document returned by the service, wrapped in a success response
 */
@GetMapping("/{id}")
public R<AiragKnowledgeDocVo> getInfo(
        @NotNull(message = "主键不能为空") @PathVariable String id) {
    AiragKnowledgeDocVo detail = airagKnowledgeDocService.queryById(id);
    return R.ok(detail);
}
Service层
/**
 * Loads a document by primary key.
 *
 * @param id primary key of the document
 * @return the view object returned by the mapper for that id
 */
@Override
public AiragKnowledgeDocVo queryById(String id) {
    AiragKnowledgeDocVo detail = airagKnowledgeDocMapper.selectVoById(id);
    return detail;
}
接口3:新增文档
Controller层
/**
 * Creates a document.
 *
 * @param bo request body, validated against the AddGroup constraints
 * @return the response built by toAjax from the service result
 */
@PostMapping("/add")
public R<Void> add(@Validated(AddGroup.class) @RequestBody AiragKnowledgeDocBo bo) {
    Boolean inserted = airagKnowledgeDocService.insertByBo(bo);
    return toAjax(inserted);
}
Service层
/**
 * Inserts a document as a draft and then triggers its vector rebuild.
 *
 * @param bo data of the document to create
 * @return {@code false} when the insert affected no row, otherwise the result of rebuildDocument
 */
@Transactional(rollbackFor = {Exception.class})
@Override
public Boolean insertByBo(AiragKnowledgeDocBo bo) {
    AiragKnowledgeDoc entity = MapstructUtils.convert(bo, AiragKnowledgeDoc.class);
    validEntityBeforeSave(entity);
    entity.setStatus(KNOWLEDGE_DOC_STATUS_DRAFT);
    boolean inserted = airagKnowledgeDocMapper.insert(entity) > 0;
    if (!inserted) {
        return false;
    }
    // Rebuild the vectors of the freshly inserted document.
    return rebuildDocument(String.valueOf(entity.getId()));
}
注:新增文档需要向量化,存入postgres库中,方便后期查询、向量检索
接口4:修改文档
Controller层
/**
 * Updates a document.
 *
 * @param bo request body, validated against the EditGroup constraints
 * @return the response built by toAjax from the service result
 */
@PutMapping("/edit")
public R<Void> edit(@Validated(EditGroup.class) @RequestBody AiragKnowledgeDocBo bo) {
    Boolean updated = airagKnowledgeDocService.updateByBo(bo);
    return toAjax(updated);
}
Service层
/**
 * Updates a document, resets it to draft and then triggers its vector rebuild.
 *
 * @param bo edited document data; its id selects the row to update
 * @return {@code false} when the update affected no row, otherwise the result of rebuildDocument
 */
@Transactional(rollbackFor = {Exception.class})
@Override
public Boolean updateByBo(AiragKnowledgeDocBo bo) {
    AiragKnowledgeDoc entity = MapstructUtils.convert(bo, AiragKnowledgeDoc.class);
    validEntityBeforeSave(entity);
    entity.setStatus(KNOWLEDGE_DOC_STATUS_DRAFT);
    boolean updated = airagKnowledgeDocMapper.updateById(entity) > 0;
    if (!updated) {
        return false;
    }
    // Rebuild the vectors of the edited document.
    return rebuildDocument(String.valueOf(entity.getId()));
}
注:编辑文档后同样需要重新向量化,原理同新增!
接口5:批量删除文档
Controller层
/**
 * Deletes one or more documents inside a transaction that rolls back on any exception.
 *
 * @param ids primary keys taken from the request path
 * @return the response built by toAjax from the service result
 */
@Transactional(rollbackFor = Exception.class)
@DeleteMapping("/{ids}")
public R<Void> remove(
        @NotEmpty(message = "主键不能为空") @PathVariable String[] ids) {
    List<String> idList = List.of(ids);
    return toAjax(airagKnowledgeDocService.deleteWithValidByIds(idList, true));
}
Service层
/**
 * Deletes documents by primary key.
 *
 * @param ids     primary keys of the documents to delete
 * @param isValid accepted for interface compatibility; not read by this method
 * @return {@code true} when at least one row was deleted
 */
@Override
public Boolean deleteWithValidByIds(List<String> ids, Boolean isValid) {
    // NOTE(review): only the document rows are removed here; no vector data is touched
    // by this method — confirm that the embeddings are cleaned up elsewhere.
    boolean removed = airagKnowledgeDocMapper.deleteByIds(ids) > 0;
    return removed;
}
接口6:压缩包导入文档
Controller层
/**
 * Imports the documents contained in an uploaded zip archive into a knowledge base.
 *
 * @param knowId id of the target knowledge base (required request parameter)
 * @param file   uploaded zip archive (required request parameter)
 * @return the response built by toAjax from the service result
 */
@PostMapping(value = "/doc/import/zip")
public R<Void> importDocumentFromZip(
        @RequestParam(name = "knowId", required = true) String knowId,
        @RequestParam(name = "file", required = true) MultipartFile file) {
    Boolean imported = airagKnowledgeDocService.importDocumentFromZip(knowId, file);
    return toAjax(imported);
}
Service层
@Transactional(rollbackFor = {Exception.class})
@Override
public Boolean importDocumentFromZip(String knowId, MultipartFile zipFile) {
if (ObjectUtil.isEmpty(knowId)) {
throw new ServiceException("请先选择知识库");
}
if (ObjectUtil.isEmpty(zipFile)) {
throw new ServiceException("请上传文件");
}
long startTime = System.currentTimeMillis();
try {
String bizPath = knowId + File.separator + UUIDGenerator.generate();
String workDir = getUploadPath() + File.separator + bizPath + File.separator;
String sourcesPath = workDir + "files";
SsrfFileTypeFilter.checkUploadFileType(zipFile);
// 通过filePath 检查文件是不是压缩包(zip)
String zipFileName = FilenameUtils.getBaseName(zipFile.getOriginalFilename());
String fileExt = FilenameUtils.getExtension(zipFile.getOriginalFilename());
if (null == fileExt || !fileExt.equalsIgnoreCase("zip")) {
throw new ServiceException("请上传zip压缩包");
}
String fullUploadPath = getUploadPath() + File.separator + bizPath;
String uploadedZipPath = CommonUtils.uploadLocal(zipFile, bizPath, getUploadPath());
// 解压缩文件
List<AiragKnowledgeDoc> docList = new ArrayList<>();
AtomicInteger fileCount = new AtomicInteger(0);
unzipFile(getUploadPath() + File.separator + uploadedZipPath, sourcesPath, uploadedFile -> {
// 仅支持txt、pdf、docx、pptx、html、md文件
String fileName = uploadedFile.getName();
if (!SUPPORT_DOC_TYPE.contains(FilenameUtils.getExtension(fileName).toLowerCase())) {
log.warn("不支持的文件类型: {}", fileName);
return;
}
String baseName = FilenameUtils.getBaseName(fileName);
AiragKnowledgeDoc doc = new AiragKnowledgeDoc();
doc.setKnowledgeId(Long.valueOf(knowId));
doc.setTitle(baseName);
doc.setType(LLMConsts.KNOWLEDGE_DOC_TYPE_FILE);
doc.setStatus(KNOWLEDGE_DOC_STATUS_DRAFT);
String relativePath;
if (File.separator.equals("\")) {
// Windows
String escapedPath = getUploadPath().replace("//", "\\");
relativePath = uploadedFile.getPath().replaceFirst("^" + escapedPath, "");
} else {
// Linux
relativePath = uploadedFile.getPath().replaceFirst("^" + getUploadPath(), "");
}
JSONObject metadata = new JSONObject();
metadata.put(LLMConsts.KNOWLEDGE_DOC_METADATA_FILEPATH, relativePath);
metadata.put(LLMConsts.KNOWLEDGE_DOC_METADATA_SOURCES_PATH, sourcesPath);
doc.setMetadata(metadata.toJSONString());
docList.add(doc);
});
// 保存数据
this.saveBatch(docList);
// 重建文档
String docIds = docList.stream()
.map(doc -> String.valueOf(doc.getId()))
.filter(oConvertUtils::isObjectNotEmpty)
.collect(Collectors.joining(","));
rebuildDocument(docIds);
} catch (Exception e) {
throw new RuntimeException(e);
}
return true;
}
接口7:文档重新向量化
Controller层
/**
 * Re-vectorizes the given documents.
 *
 * @param docIds comma-separated document ids (request parameter "docIds")
 * @return the response built by toAjax from the service result
 */
@PutMapping(value = "/doc/rebuild")
public R<Void> rebuildDocument(@RequestParam("docIds") String docIds) {
    Boolean rebuilt = airagKnowledgeDocService.rebuildDocument(docIds);
    return toAjax(rebuilt);
}
Service层
/**
 * Marks the given documents as building and re-vectorizes each of them asynchronously.
 * <p>
 * Documents that are already in the building state are skipped unless their last update
 * is older than five minutes (or unknown), in which case they are rebuilt again. Each
 * document ends up in the complete state when the embedding handler returns metadata and
 * falls back to draft when it returns {@code null} or throws.
 *
 * @param docIds comma-separated document ids
 * @return {@code true} once the rebuild tasks have been submitted, or when nothing needs rebuilding
 * @throws ServiceException when no id is supplied or none of the ids matches a document
 */
@Transactional(rollbackFor = Exception.class)
public Boolean rebuildDocument(String docIds) {
    if (ObjectUtil.isEmpty(docIds)) {
        throw new ServiceException("请选择重建的文档");
    }
    // Tolerate spaces and empty segments such as "1, 2," in the id list.
    List<String> docIdList = Arrays.stream(docIds.split(","))
        .map(String::trim)
        .filter(id -> !id.isEmpty())
        .collect(Collectors.toList());
    if (docIdList.isEmpty()) {
        throw new ServiceException("请选择重建的文档");
    }
    // Load the documents.
    List<AiragKnowledgeDoc> docList = airagKnowledgeDocMapper.selectBatchIds(docIdList);
    if (ObjectUtil.isEmpty(docList)) {
        throw new ServiceException("文档不存在");
    }
    String baseUrl = "";
    // A build that has been running longer than this is considered stuck and is restarted.
    final long staleBuildMillis = 5 * 60 * 1000L;
    List<AiragKnowledgeDoc> knowledgeDocs = docList.stream()
        .filter(doc -> {
            if (!KNOWLEDGE_DOC_STATUS_BUILDING.equalsIgnoreCase(doc.getStatus())) {
                return true;
            }
            Date updateTime = doc.getUpdateTime();
            return updateTime == null
                || System.currentTimeMillis() - updateTime.getTime() > staleBuildMillis;
        })
        .collect(Collectors.toList());
    if (ObjectUtil.isEmpty(knowledgeDocs)) {
        return true;
    }
    for (AiragKnowledgeDoc doc : knowledgeDocs) {
        doc.setStatus(KNOWLEDGE_DOC_STATUS_BUILDING);
        doc.setBaseUrl(baseUrl);
    }
    // Persist the building state before the asynchronous work starts.
    this.updateBatchById(knowledgeDocs);
    // NOTE(review): the tasks below are submitted before the surrounding transaction commits —
    // confirm that the embedding handler does not depend on the committed rows.
    knowledgeDocs.forEach(doc -> CompletableFuture.runAsync(() -> {
        String knowId = String.valueOf(doc.getKnowledgeId());
        // Written again when the task actually starts; presumably this refreshes update_time
        // for the stale-build check above — TODO confirm the column is auto-filled on update.
        doc.setStatus(KNOWLEDGE_DOC_STATUS_BUILDING);
        this.updateById(doc);
        try {
            Map<String, Object> metadata = embeddingHandler.embeddingDocument(knowId, doc);
            doc.setStatus(null != metadata ? KNOWLEDGE_DOC_STATUS_COMPLETE : KNOWLEDGE_DOC_STATUS_DRAFT);
            this.updateById(doc);
        } catch (Throwable t) {
            // Record the failure instead of dropping it, then return the document to draft
            // so that it does not stay in the building state.
            log.error("文档向量化失败, docId: {}", doc.getId(), t);
            doc.setStatus(KNOWLEDGE_DOC_STATUS_DRAFT);
            this.updateById(doc);
        }
    }, buildDocExecutorService));
    return true;
}
注:此处用到异步线程,简历上可加分!
接口8:命中测试
Controller层
/**
 * Runs a hit test (vector search) against a knowledge base.
 *
 * @param knowId     id of the knowledge base, taken from the request path
 * @param queryText  text to search for
 * @param topNumber  value of the "topNumber" request parameter, passed through to the service
 * @param similarity value of the "similarity" request parameter, passed through to the service
 * @return the search results wrapped in a success response
 */
@GetMapping(value = "/embedding/hitTest/{knowId}")
public R<List<Map<String, Object>>> hitTest(
        @PathVariable("knowId") String knowId,
        @RequestParam(name = "queryText") String queryText,
        @RequestParam(name = "topNumber") Integer topNumber,
        @RequestParam(name = "similarity") Double similarity) {
    List<Map<String, Object>> hits =
        airagKnowledgeDocService.hitTest(knowId, queryText, topNumber, similarity);
    return R.ok(hits);
}
Service层
/**
 * Delegates a hit test to the embedding handler.
 *
 * @param knowId     id of the knowledge base to search
 * @param queryText  text to search for
 * @param topNumber  passed through to the embedding handler
 * @param similarity passed through to the embedding handler
 * @return the results returned by the embedding handler
 */
@Override
public List<Map<String, Object>> hitTest(String knowId, String queryText, Integer topNumber, Double similarity) {
    return embeddingHandler.searchEmbedding(knowId, queryText, topNumber, similarity);
}
至此,通过上述流程已实现知识库搭建!
© 版权声明
文章版权归作者所有,未经允许请勿转载。
相关文章
暂无评论...


