教你使用langchain4j手撸一个知识库(实战篇)

内容分享3周前发布
0 0 0

重要的是:思路和方法

教你使用langchain4j手撸一个知识库(实战篇)

步骤一:表结构设计

知识库表:用于管理文档

-- Knowledge base table (MySQL): one row per knowledge base that groups documents.
-- Character columns inherit the table-level utf8mb3 / utf8mb3_general_ci defaults,
-- so no per-column CHARACTER SET / COLLATE clause is needed.
CREATE TABLE `airag_knowledge` (
  `id`          bigint       NOT NULL AUTO_INCREMENT,
  `name`        varchar(100) DEFAULT NULL COMMENT '知识库名称',
  `descr`       varchar(500) DEFAULT NULL COMMENT '描述',
  `embed_id`    varchar(32)  DEFAULT NULL COMMENT '向量模型id',
  `status`      varchar(32)  DEFAULT NULL COMMENT '状态',
  `tenant_id`   varchar(20)  DEFAULT NULL COMMENT '租户id',
  -- audit columns
  `create_dept` bigint       DEFAULT NULL COMMENT '创建部门',
  `create_by`   bigint       DEFAULT NULL COMMENT '创建者',
  `create_time` datetime     DEFAULT NULL COMMENT '创建时间',
  `update_by`   bigint       DEFAULT NULL COMMENT '更新者',
  `update_time` datetime     DEFAULT NULL COMMENT '更新时间',
  -- logical-delete marker: '0' = present, '2' = deleted
  `del_flag`    char(1)      DEFAULT '0'  COMMENT '删除标志(0代表存在 2代表删除)',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb3 COLLATE=utf8mb3_general_ci ROW_FORMAT=DYNAMIC COMMENT='知识库表';

文档表:用于存储文件信息

-- Knowledge document table (MySQL): one row per document inside a knowledge base.
-- Character columns inherit the table-level utf8mb3 / utf8mb3_general_ci defaults,
-- so no per-column CHARACTER SET / COLLATE clause is needed.
CREATE TABLE `airag_knowledge_doc` (
  `id`           bigint       NOT NULL AUTO_INCREMENT,
  `knowledge_id` bigint       DEFAULT NULL COMMENT '知识库id',
  `title`        varchar(100) DEFAULT NULL COMMENT '标题',
  `type`         varchar(32)  DEFAULT NULL COMMENT '类型',
  `content`      text                      COMMENT '内容',
  `status`       varchar(32)  DEFAULT NULL COMMENT '状态',
  `metadata`     text                      COMMENT '元数据',
  `tenant_id`    varchar(20)  DEFAULT NULL COMMENT '租户id',
  -- audit columns
  `create_dept`  bigint       DEFAULT NULL COMMENT '创建部门',
  `create_by`    bigint       DEFAULT NULL COMMENT '创建者',
  `create_time`  datetime     DEFAULT NULL COMMENT '创建时间',
  `update_by`    bigint       DEFAULT NULL COMMENT '更新者',
  `update_time`  datetime     DEFAULT NULL COMMENT '更新时间',
  -- logical-delete marker: '0' = present, '2' = deleted
  `del_flag`     char(1)      DEFAULT '0'  COMMENT '删除标志(0代表存在 2代表删除)',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb3 COLLATE=utf8mb3_general_ci ROW_FORMAT=DYNAMIC COMMENT='知识库文档表';

模型配置表:用于切换模型、构建LLM、Embed模型

-- Model configuration table (MySQL): one row per configured model
-- (provider, model name, credential, endpoint, type and parameters).
-- Character columns inherit the table-level utf8mb3 / utf8mb3_general_ci defaults,
-- so no per-column CHARACTER SET / COLLATE clause is needed.
CREATE TABLE `airag_model` (
  `id`           bigint       NOT NULL AUTO_INCREMENT,
  `name`         varchar(100) DEFAULT NULL COMMENT '名称',
  `provider`     varchar(50)  DEFAULT NULL COMMENT '供应者',
  `model_name`   varchar(100) DEFAULT NULL COMMENT '模型名称',
  `credential`   varchar(500) DEFAULT NULL COMMENT '凭证信息',
  `base_url`     varchar(500) DEFAULT NULL COMMENT 'API域名',
  `model_type`   varchar(32)  DEFAULT NULL COMMENT '模型类型',
  `model_params` varchar(500) DEFAULT NULL COMMENT '模型参数',
  `tenant_id`    varchar(20)  DEFAULT NULL COMMENT '租户id',
  -- audit columns
  `create_dept`  bigint       DEFAULT NULL COMMENT '创建部门',
  `create_by`    bigint       DEFAULT NULL COMMENT '创建者',
  `create_time`  datetime     DEFAULT NULL COMMENT '创建时间',
  `update_by`    bigint       DEFAULT NULL COMMENT '更新者',
  `update_time`  datetime     DEFAULT NULL COMMENT '更新时间',
  -- logical-delete marker: '0' = present, '2' = deleted
  `del_flag`     char(1)      DEFAULT '0'  COMMENT '删除标志(0代表存在 2代表删除)',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb3 COLLATE=utf8mb3_general_ci ROW_FORMAT=DYNAMIC COMMENT='模型管理表';

向量库表:存储文档的向量数据

-- Vector store table (PostgreSQL): one row per embedded text segment.
-- The "vector" column type is not a PostgreSQL built-in; presumably it comes
-- from the pgvector extension, which must be installed first (TODO confirm).
CREATE TABLE "public"."embeddings" (
  "embedding_id" uuid NOT NULL,          -- primary key of the segment
  "embedding"    "public"."vector",      -- embedding vector
  "text"         text,                   -- segment text (database default collation)
  "metadata"     json,                   -- metadata stored as JSON
  CONSTRAINT "vector_1_pkey" PRIMARY KEY ("embedding_id")
);

步骤二:知识库功能开发

接口1:分页查询知识库

Controller层

/**
 * Returns one page of knowledge bases.
 *
 * @param bo        filter criteria bound from the request
 * @param pageQuery paging parameters bound from the request
 * @return the page produced by the knowledge base service
 */
@GetMapping("/list")
public TableDataInfo<AiragKnowledgeVo> list(AiragKnowledgeBo bo, PageQuery pageQuery) {
    TableDataInfo<AiragKnowledgeVo> page = airagKnowledgeService.queryPageList(bo, pageQuery);
    return page;
}

Service层

/**
 * Queries one page of knowledge bases.
 *
 * @param bo        filter criteria, turned into a query wrapper by {@code buildQueryWrapper}
 * @param pageQuery paging parameters
 * @return the mapper's page result converted with {@code TableDataInfo.build}
 */
@Override
public TableDataInfo<AiragKnowledgeVo> queryPageList(AiragKnowledgeBo bo, PageQuery pageQuery) {
    LambdaQueryWrapper<AiragKnowledge> criteria = buildQueryWrapper(bo);
    Page<AiragKnowledgeVo> page =
        airagKnowledgeMapper.selectVoPage(pageQuery.build(), criteria);
    return TableDataInfo.build(page);
}

接口2:知识库详情

Controller层

/**
 * Returns the detail view of a single knowledge base.
 *
 * @param id primary key taken from the URL path; must not be null
 * @return the service lookup result wrapped in a success response
 */
@GetMapping("/{id}")
public R<AiragKnowledgeVo> getInfo(
        @NotNull(message = "主键不能为空") @PathVariable String id) {
    AiragKnowledgeVo detail = airagKnowledgeService.queryById(id);
    return R.ok(detail);
}

Service层

/**
 * Looks up one knowledge base by primary key.
 *
 * @param id primary key of the knowledge base
 * @return whatever the mapper's {@code selectVoById} returns for that key
 */
@Override
public AiragKnowledgeVo queryById(String id) {
    AiragKnowledgeVo found = airagKnowledgeMapper.selectVoById(id);
    return found;
}

接口3:新增知识库

Controller层

/**
 * Creates a knowledge base from the request body.
 *
 * @param bo request payload, validated against {@code AddGroup}
 * @return the service result converted by {@code toAjax}
 */
@PostMapping("/add")
public R<Void> add(@Validated(AddGroup.class) @RequestBody AiragKnowledgeBo bo) {
    Boolean saved = airagKnowledgeService.insertByBo(bo);
    return toAjax(saved);
}

Service层

/**
 * Inserts a new knowledge base with status {@code LLMConsts.STATUS_ENABLE}.
 *
 * @param bo values for the new knowledge base; on success its id is set from
 *           the inserted entity
 * @return {@code true} when the insert affected at least one row
 */
@Override
public Boolean insertByBo(AiragKnowledgeBo bo) {
    AiragKnowledge entity = MapstructUtils.convert(bo, AiragKnowledge.class);
    entity.setStatus(LLMConsts.STATUS_ENABLE);
    int affectedRows = airagKnowledgeMapper.insert(entity);
    if (affectedRows <= 0) {
        return false;
    }
    // Hand the id of the inserted entity back to the caller through the BO.
    bo.setId(entity.getId());
    return true;
}

接口4:修改知识库

Controller层

/**
 * Updates a knowledge base from the request body.
 *
 * @param bo request payload, validated against {@code EditGroup}
 * @return the service result converted by {@code toAjax}
 */
@Transactional(rollbackFor = Exception.class)
@PutMapping("/edit")
public R<Void> edit(@Validated(EditGroup.class) @RequestBody AiragKnowledgeBo bo) {
    Boolean updated = airagKnowledgeService.updateByBo(bo);
    return toAjax(updated);
}

Service层

/**
 * Updates a knowledge base and, when its embedding model changed, triggers a
 * rebuild of the documents that belong to it.
 *
 * @param bo new values; its id identifies the row to update
 * @return {@code true} when a row was updated, {@code false} when the update
 *         affected no row
 * @throws ServiceException when no knowledge base exists for the given id
 */
@Override
public Boolean updateByBo(AiragKnowledgeBo bo) {
    AiragKnowledge update = MapstructUtils.convert(bo, AiragKnowledge.class);
    AiragKnowledge existing = airagKnowledgeMapper.selectById(update.getId());
    if (ObjectUtil.isEmpty(existing)) {
        throw new ServiceException("未找到对应数据");
    }
    // Capture the old model id before the row is overwritten.
    String oldEmbedId = existing.getEmbedId();
    int updated = airagKnowledgeMapper.updateById(update);
    // An affected-row count is never negative, so "< 0" could not detect a
    // failed update; zero affected rows means nothing was written.
    if (updated <= 0) {
        return false;
    }
    // The stored embed id is nullable, so compare null-safely instead of
    // dereferencing it directly.
    // NOTE(review): a null embed id in the request is still treated as a change
    // when the stored one is non-null (same as before) — confirm that is intended.
    String newEmbedId = update.getEmbedId();
    boolean embedModelChanged = (oldEmbedId == null)
        ? newEmbedId != null
        : !oldEmbedId.equalsIgnoreCase(newEmbedId);
    if (embedModelChanged) {
        // Vectors produced by the previous model are stale; rebuild the documents.
        iairagKnowledgeDocService.rebuildDocumentByKnowId(String.valueOf(update.getId()));
    }
    return true;
}

接口5:批量删除知识库

Controller层

/**
 * Deletes one or more knowledge bases.
 *
 * @param ids primary keys taken from the URL path; must not be empty
 * @return the service result converted by {@code toAjax}
 */
@DeleteMapping("/{ids}")
public R<Void> remove(
        @NotEmpty(message = "主键不能为空") @PathVariable String[] ids) {
    List<String> idList = List.of(ids);
    Boolean removed = airagKnowledgeService.deleteWithValidByIds(idList, true);
    return toAjax(removed);
}

Service层

/**
 * Deletes the given knowledge bases after removing their documents.
 *
 * @param ids     primary keys of the knowledge bases to delete
 * @param isValid not read by this implementation
 * @return always {@code true}
 */
@Override
public Boolean deleteWithValidByIds(List<String> ids, Boolean isValid) {
    // Dependent documents go first, then the knowledge bases themselves.
    // NOTE(review): removeByKnowIds is presumably also responsible for deleting
    // the matching vector data — confirm in the document service.
    iairagKnowledgeDocService.removeByKnowIds(ids);
    airagKnowledgeMapper.deleteByIds(ids);
    return Boolean.TRUE;
}

注:删除知识库时,必须同时删除该库下的所有文档,以及这些文档对应的向量数据!

步骤三:文档功能开发

接口1:文档分页查询

Controller层

/**
 * Returns one page of documents.
 *
 * @param bo        filter criteria bound from the request
 * @param pageQuery paging parameters bound from the request
 * @return the page produced by the document service
 */
@GetMapping("/doc/list")
public TableDataInfo<AiragKnowledgeDocVo> list(AiragKnowledgeDocBo bo, PageQuery pageQuery) {
    TableDataInfo<AiragKnowledgeDocVo> page = airagKnowledgeDocService.queryPageList(bo, pageQuery);
    return page;
}

Service层

/**
 * Queries one page of documents for a knowledge base.
 *
 * @param bo        filter criteria; its knowledge base id must be present
 * @param pageQuery paging parameters
 * @return the mapper's page result converted with {@code TableDataInfo.build}
 * @throws ServiceException when the knowledge base id in {@code bo} is empty
 */
@Override
public TableDataInfo<AiragKnowledgeDocVo> queryPageList(AiragKnowledgeDocBo bo, PageQuery pageQuery) {
    boolean knowledgeMissing = ObjectUtil.isEmpty(bo.getKnowledgeId());
    if (knowledgeMissing) {
        throw new ServiceException("请先选择知识库");
    }
    LambdaQueryWrapper<AiragKnowledgeDoc> criteria = buildQueryWrapper(bo);
    Page<AiragKnowledgeDocVo> page =
        airagKnowledgeDocMapper.selectVoPage(pageQuery.build(), criteria);
    return TableDataInfo.build(page);
}

接口2:文档详情

Controller层

/**
 * Returns the detail view of a single document.
 *
 * @param id primary key taken from the URL path; must not be null
 * @return the service lookup result wrapped in a success response
 */
@GetMapping("/{id}")
public R<AiragKnowledgeDocVo> getInfo(
        @NotNull(message = "主键不能为空") @PathVariable String id) {
    AiragKnowledgeDocVo detail = airagKnowledgeDocService.queryById(id);
    return R.ok(detail);
}

Service层

/**
 * Looks up one document by primary key.
 *
 * @param id primary key of the document
 * @return whatever the mapper's {@code selectVoById} returns for that key
 */
@Override
public AiragKnowledgeDocVo queryById(String id) {
    AiragKnowledgeDocVo found = airagKnowledgeDocMapper.selectVoById(id);
    return found;
}

接口3:新增文档

Controller层

/**
 * Creates a document from the request body.
 *
 * @param bo request payload, validated against {@code AddGroup}
 * @return the service result converted by {@code toAjax}
 */
@PostMapping("/add")
public R<Void> add(@Validated(AddGroup.class) @RequestBody AiragKnowledgeDocBo bo) {
    Boolean saved = airagKnowledgeDocService.insertByBo(bo);
    return toAjax(saved);
}

Service层

/**
 * Inserts a new document in draft status and then starts a vector rebuild for it.
 *
 * @param bo values for the new document
 * @return {@code false} when the insert affected no row; otherwise the result
 *         of {@code rebuildDocument} for the new document id
 */
@Transactional(rollbackFor = {Exception.class})
@Override
public Boolean insertByBo(AiragKnowledgeDocBo bo) {
    AiragKnowledgeDoc entity = MapstructUtils.convert(bo, AiragKnowledgeDoc.class);
    validEntityBeforeSave(entity);
    entity.setStatus(KNOWLEDGE_DOC_STATUS_DRAFT);
    if (airagKnowledgeDocMapper.insert(entity) <= 0) {
        return false;
    }
    // The row exists now; rebuild its vectors.
    return rebuildDocument(String.valueOf(entity.getId()));
}

注:新增文档需要向量化,存入postgres库中,方便后期查询、向量检索

接口4:修改文档

Controller层

/**
 * Updates a document from the request body.
 *
 * @param bo request payload, validated against {@code EditGroup}
 * @return the service result converted by {@code toAjax}
 */
@PutMapping("/edit")
public R<Void> edit(@Validated(EditGroup.class) @RequestBody AiragKnowledgeDocBo bo) {
    Boolean updated = airagKnowledgeDocService.updateByBo(bo);
    return toAjax(updated);
}

Service层

/**
 * Updates a document, resets it to draft status and then starts a vector
 * rebuild for it.
 *
 * @param bo new values; its id identifies the row to update
 * @return {@code false} when the update affected no row; otherwise the result
 *         of {@code rebuildDocument} for the document id
 */
@Transactional(rollbackFor = {Exception.class})
@Override
public Boolean updateByBo(AiragKnowledgeDocBo bo) {
    AiragKnowledgeDoc entity = MapstructUtils.convert(bo, AiragKnowledgeDoc.class);
    validEntityBeforeSave(entity);
    entity.setStatus(KNOWLEDGE_DOC_STATUS_DRAFT);
    if (airagKnowledgeDocMapper.updateById(entity) <= 0) {
        return false;
    }
    // The row changed; rebuild its vectors.
    return rebuildDocument(String.valueOf(entity.getId()));
}

注:修改文档后同样需要重新向量化,处理方式与新增文档一致!

接口5:批量删除文档

Controller层

/**
 * Deletes one or more documents.
 *
 * @param ids primary keys taken from the URL path; must not be empty
 * @return the service result converted by {@code toAjax}
 */
@Transactional(rollbackFor = Exception.class)
@DeleteMapping("/{ids}")
public R<Void> remove(
        @NotEmpty(message = "主键不能为空") @PathVariable String[] ids) {
    List<String> idList = List.of(ids);
    Boolean removed = airagKnowledgeDocService.deleteWithValidByIds(idList, true);
    return toAjax(removed);
}

Service层

/**
 * Deletes the given documents.
 *
 * @param ids     primary keys of the documents to delete
 * @param isValid not read by this implementation
 * @return {@code true} when at least one row was deleted
 */
@Override
public Boolean deleteWithValidByIds(List<String> ids, Boolean isValid) {
    // NOTE(review): only the document rows are removed here; no vector data is
    // deleted in this method — confirm whether that cleanup happens elsewhere.
    int deletedRows = airagKnowledgeDocMapper.deleteByIds(ids);
    return deletedRows > 0;
}

接口6:压缩包导入文档

Controller层

/**
 * Imports documents from an uploaded zip archive into a knowledge base.
 *
 * @param knowId id of the target knowledge base (required request parameter)
 * @param file   uploaded archive (required request parameter)
 * @return the service result converted by {@code toAjax}
 */
@PostMapping(value = "/doc/import/zip")
public R<Void> importDocumentFromZip(
        @RequestParam(name = "knowId", required = true) String knowId,
        @RequestParam(name = "file", required = true) MultipartFile file) {
    Boolean imported = airagKnowledgeDocService.importDocumentFromZip(knowId, file);
    return toAjax(imported);
}

Service层

@Transactional(rollbackFor = {Exception.class})
@Override
public Boolean importDocumentFromZip(String knowId, MultipartFile zipFile) {
    if (ObjectUtil.isEmpty(knowId)) {
    	throw new ServiceException("请先选择知识库");
    }
    if (ObjectUtil.isEmpty(zipFile)) {
    	throw new ServiceException("请上传文件");
    }
    long startTime = System.currentTimeMillis();


    try {
        String bizPath = knowId + File.separator + UUIDGenerator.generate();
        String workDir = getUploadPath() + File.separator + bizPath + File.separator;
        String sourcesPath = workDir + "files";


        SsrfFileTypeFilter.checkUploadFileType(zipFile);
        // 通过filePath 检查文件是不是压缩包(zip)
        String zipFileName = FilenameUtils.getBaseName(zipFile.getOriginalFilename());
        String fileExt = FilenameUtils.getExtension(zipFile.getOriginalFilename());
        if (null == fileExt || !fileExt.equalsIgnoreCase("zip")) {
        	throw new ServiceException("请上传zip压缩包");
        }
        String fullUploadPath = getUploadPath() + File.separator + bizPath;
        String uploadedZipPath = CommonUtils.uploadLocal(zipFile, bizPath, getUploadPath());
        // 解压缩文件
        List<AiragKnowledgeDoc> docList = new ArrayList<>();
        AtomicInteger fileCount = new AtomicInteger(0);
        unzipFile(getUploadPath() + File.separator + uploadedZipPath, sourcesPath, uploadedFile -> {
            // 仅支持txt、pdf、docx、pptx、html、md文件
            String fileName = uploadedFile.getName();
            if (!SUPPORT_DOC_TYPE.contains(FilenameUtils.getExtension(fileName).toLowerCase())) {
                log.warn("不支持的文件类型: {}", fileName);
                return;
            }
            String baseName = FilenameUtils.getBaseName(fileName);
            AiragKnowledgeDoc doc = new AiragKnowledgeDoc();
            doc.setKnowledgeId(Long.valueOf(knowId));
            doc.setTitle(baseName);
            doc.setType(LLMConsts.KNOWLEDGE_DOC_TYPE_FILE);
            doc.setStatus(KNOWLEDGE_DOC_STATUS_DRAFT);


            String relativePath;
            if (File.separator.equals("\")) {
                // Windows
                String escapedPath = getUploadPath().replace("//", "\\");
                relativePath = uploadedFile.getPath().replaceFirst("^" + escapedPath, "");
            } else {
                // Linux
                relativePath = uploadedFile.getPath().replaceFirst("^" + getUploadPath(), "");
            }
            JSONObject metadata = new JSONObject();
            metadata.put(LLMConsts.KNOWLEDGE_DOC_METADATA_FILEPATH, relativePath);
            metadata.put(LLMConsts.KNOWLEDGE_DOC_METADATA_SOURCES_PATH, sourcesPath);
            doc.setMetadata(metadata.toJSONString());
            docList.add(doc);
        });
        // 保存数据
        this.saveBatch(docList);
        // 重建文档
        String docIds = docList.stream()
            .map(doc -> String.valueOf(doc.getId()))
            .filter(oConvertUtils::isObjectNotEmpty)
            .collect(Collectors.joining(","));
        rebuildDocument(docIds);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
    return true;
}

接口7:文档重新向量化

Controller层

/**
 * Starts a vector rebuild for the given documents.
 *
 * @param docIds document ids passed as the {@code docIds} request parameter
 * @return the service result converted by {@code toAjax}
 */
@PutMapping(value = "/doc/rebuild")
public R<Void> rebuildDocument(@RequestParam("docIds") String docIds) {
    Boolean accepted = airagKnowledgeDocService.rebuildDocument(docIds);
    return toAjax(accepted);
}

Service层

/**
 * Marks the given documents as building and re-embeds each of them
 * asynchronously on {@code buildDocExecutorService}.
 *
 * <p>A document already in building status is skipped unless its last update
 * is more than five minutes old or has no timestamp, in which case the build
 * is treated as stuck and restarted.
 *
 * @param docIds comma-separated document ids; blanks around ids and empty
 *               entries are ignored
 * @return {@code true} once the eligible documents are scheduled, including
 *         the case where none is eligible
 * @throws ServiceException when no id is supplied or no matching document exists
 */
@Transactional(rollbackFor = Exception.class)
public Boolean rebuildDocument(String docIds) {
    if (ObjectUtil.isEmpty(docIds)) {
        throw new ServiceException("请选择重建的文档");
    }
    // Tolerate input such as "1, 2,,3" instead of querying with blank ids.
    List<String> docIdList = Arrays.stream(docIds.split(","))
        .map(String::trim)
        .filter(id -> !id.isEmpty())
        .collect(Collectors.toList());
    if (docIdList.isEmpty()) {
        throw new ServiceException("请选择重建的文档");
    }
    // Load the documents.
    List<AiragKnowledgeDoc> docList = airagKnowledgeDocMapper.selectBatchIds(docIdList);
    if (ObjectUtil.isEmpty(docList)) {
        throw new ServiceException("文档不存在");
    }
    String baseUrl = "";
    // Select the documents that may be rebuilt and flag them as building.
    List<AiragKnowledgeDoc> knowledgeDocs = new ArrayList<>();
    for (AiragKnowledgeDoc doc : docList) {
        if (isRebuildAllowed(doc)) {
            doc.setStatus(KNOWLEDGE_DOC_STATUS_BUILDING);
            doc.setBaseUrl(baseUrl);
            knowledgeDocs.add(doc);
        }
    }
    if (knowledgeDocs.isEmpty()) {
        return true;
    }
    // Persist the status change.
    this.updateBatchById(knowledgeDocs);
    // Rebuild each document asynchronously.
    // NOTE(review): the tasks may start before the surrounding transaction
    // commits — confirm the worker-side updates behave as expected in that case.
    for (AiragKnowledgeDoc doc : knowledgeDocs) {
        CompletableFuture.runAsync(() -> embedAndUpdateStatus(doc), buildDocExecutorService);
    }
    return true;
}

/**
 * Decides whether a document may be (re)built now: always, unless it is
 * currently building and was updated within the last five minutes.
 */
private boolean isRebuildAllowed(AiragKnowledgeDoc doc) {
    if (!KNOWLEDGE_DOC_STATUS_BUILDING.equalsIgnoreCase(doc.getStatus())) {
        return true;
    }
    Date updateTime = doc.getUpdateTime();
    if (updateTime == null) {
        return true;
    }
    // A build older than five minutes is treated as stuck and restarted, so a
    // document cannot stay in building status indefinitely.
    long stuckBuildTimeoutMillis = 5 * 60 * 1000L;
    return System.currentTimeMillis() - updateTime.getTime() > stuckBuildTimeoutMillis;
}

/**
 * Embeds one document and stores the resulting status: complete when the
 * handler returns metadata, draft when it returns {@code null} or fails.
 */
private void embedAndUpdateStatus(AiragKnowledgeDoc doc) {
    String knowId = String.valueOf(doc.getKnowledgeId());
    doc.setStatus(KNOWLEDGE_DOC_STATUS_BUILDING);
    this.updateById(doc);
    try {
        Map<String, Object> metadata = embeddingHandler.embeddingDocument(knowId, doc);
        doc.setStatus(null != metadata ? KNOWLEDGE_DOC_STATUS_COMPLETE : KNOWLEDGE_DOC_STATUS_DRAFT);
        this.updateById(doc);
    } catch (Throwable t) {
        // Record the cause instead of dropping it, then fall back to draft so
        // the document can be rebuilt again.
        log.error("文档向量化失败, docId={}", doc.getId(), t);
        doc.setStatus(KNOWLEDGE_DOC_STATUS_DRAFT);
        this.updateById(doc);
    }
}

注:此处用到异步线程,简历上可加分!

接口8:命中测试

Controller层

/**
 * Runs a retrieval hit test against a knowledge base.
 *
 * @param knowId     id of the knowledge base, taken from the URL path
 * @param queryText  query text passed to the search
 * @param topNumber  value of the {@code topNumber} request parameter
 * @param similarity value of the {@code similarity} request parameter
 * @return the service result wrapped in a success response
 */
@GetMapping(value = "/embedding/hitTest/{knowId}")
public R<List<Map<String, Object>>> hitTest(
        @PathVariable("knowId") String knowId,
        @RequestParam(name = "queryText") String queryText,
        @RequestParam(name = "topNumber") Integer topNumber,
        @RequestParam(name = "similarity") Double similarity) {
    List<Map<String, Object>> hits =
        airagKnowledgeDocService.hitTest(knowId, queryText, topNumber, similarity);
    return R.ok(hits);
}

Service层

/**
 * Searches the embeddings of a knowledge base for the given query text.
 *
 * @param knowId     id of the knowledge base
 * @param queryText  query text passed to the search
 * @param topNumber  forwarded unchanged to the embedding handler
 * @param similarity forwarded unchanged to the embedding handler
 * @return the result of {@code embeddingHandler.searchEmbedding}
 */
@Override
public List<Map<String, Object>> hitTest(String knowId, String queryText, Integer topNumber, Double similarity) {
    return embeddingHandler.searchEmbedding(knowId, queryText, topNumber, similarity);
}

至此,通过上述流程已实现知识库搭建!

© 版权声明

相关文章

暂无评论

您必须登录才能参与评论!
立即登录
none
暂无评论...