1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177
class DatasetManager:
    """Walk through dataset create/list/update/delete calls on a RAGFlow client."""

    def __init__(self, api_key: str, base_url: str):
        """Build the RAGFlow client that every example method goes through."""
        client = RAGFlow(api_key=api_key, base_url=base_url)
        self.rag_object = client

    def create_dataset_example(self):
        """Create one dataset per chunk method and return those that succeeded.

        A failure for one chunk method is printed and the loop moves on to
        the next method.
        """
        print("=== 创建数据集示例 ===")
        succeeded = []
        for method in (
            "naive",
            "manual",
            "qa",
            "table",
            "paper",
            "book",
            "laws",
            "presentation",
            "picture",
            "one",
            "email",
        ):
            try:
                # The timestamp suffix keeps names from different runs apart.
                request = {
                    "name": f"kb_{method}_{int(time.time())}",
                    "description": f"使用 {method} 分块方法的数据集",
                    "chunk_method": method,
                    "embedding_model": "BAAI/bge-large-zh-v1.5@BAAI",
                    "permission": "me",
                }
                dataset = self.rag_object.create_dataset(**request)
                print(f"创建{method}数据集: {dataset.name} (ID: {dataset.id})")
                succeeded.append(dataset)
            except Exception as exc:
                print(f"创建{method}数据集失败: {exc}")
        return succeeded

    def _get_parser_config(self, chunk_method: str) -> Optional[Any]:
        """Return the parser configuration for ``chunk_method``.

        The result is ``None`` both for methods that have no configuration
        ("table", "picture", "one", "email") and for unrecognised methods.
        A fresh dict is built on every call.
        """
        if chunk_method == "naive":
            return {
                "chunk_token_num": 512,
                "delimiter": "\n",
                "html4excel": False,
                "layout_recognize": True,
                "raptor": {"use_raptor": False},
            }
        if chunk_method == "knowledge-graph":
            return {
                "chunk_token_num": 128,
                "delimiter": "\n",
                "entity_types": ["organization", "person", "location", "event", "time"],
            }
        # These methods share the same single-key configuration.
        raptor_off_only = {"qa", "manual", "paper", "book", "laws", "presentation"}
        if chunk_method in raptor_off_only:
            return {"raptor": {"use_raptor": False}}
        return None

    def list_datasets_example(self):
        """Print several views of the dataset list.

        Shows the default listing, a page of five, the first three entries of
        a listing requested with ``orderby="update_time", desc=True``, and
        finally a per-dataset lookup by ID in which any exception puts the
        dataset in the "no permission" group.
        """
        client = self.rag_object

        print("\n=== 列出数据集示例 ===")
        everything = client.list_datasets()
        print(f"总共有{len(everything)} 个数据集:")
        for ds in everything:
            name, ds_id = ds.name, ds.id
            created = getattr(ds, "create_time", "N/A")
            print(f" - {name} (ID: {ds_id}, 创建时间: {created})")

        first_page = client.list_datasets(page=1, page_size=5)
        print("\n第一页数据集 (每页5个):")
        for ds in first_page:
            print(f" - {ds.name}")

        by_update_time = client.list_datasets(orderby="update_time", desc=True)
        print("\n按更新时间排序的数据集:")
        for ds in by_update_time[:3]:
            print(f" - {ds.name} (更新时间: {getattr(ds, 'update_time', 'N/A')})")

        print("\n=== 遍历所有数据集,检查权限 ===")
        accessible, denied = [], []
        for ds in everything:
            try:
                client.list_datasets(id=ds.id)
                accessible.append(ds.name)
            except Exception:
                # Any failure of the lookup is counted as "no permission".
                denied.append(ds.name)
        print(f"\n当前用户有权限的数据集: {accessible}")
        print(f"当前用户无权限的数据集: {denied}")

    def update_dataset_example(self, dataset_id: str):
        """Update one dataset's name, description and embedding model.

        The dataset is looked up by ID, updated, then looked up again so the
        stored values can be printed. Any exception is printed, not raised.
        """
        print(f"\n=== 更新数据集示例 (ID: {dataset_id}) ===")
        try:
            changes = dict(
                name="updated_kb",
                description="已更新的知识库数据集",
                embedding_model="text-embedding-v2",
            )
            found = self.rag_object.list_datasets(id=dataset_id)
            # NOTE(review): the "not found" branch is paired with this first
            # lookup; confirm against the original indentation.
            if not found:
                print(f"未找到ID为 {dataset_id} 的数据集")
                return

            target = found[0]
            target.update(changes)
            print(f"数据集更新成功: {target.name}")

            refreshed = self.rag_object.list_datasets(id=dataset_id)
            if refreshed:
                current = refreshed[0]
                print("更新后的数据集信息:")
                print(f" 名称: {current.name}")
                print(f" 描述: {current.description}")
                print(f" 嵌入模型: {current.embedding_model}")
        except Exception as exc:
            print(f"更新数据集失败: {exc}")

    def delete_datasets_example(self, dataset_ids: List[str]):
        """Delete the given datasets, then issue a delete with an empty ID list.

        Each of the two delete calls reports its own failure by printing the
        exception.
        """
        print("\n=== 删除数据集示例 ===")

        if dataset_ids:
            try:
                self.rag_object.delete_datasets(ids=dataset_ids)
                print(f"成功删除 {len(dataset_ids)} 个数据集")
            except Exception as exc:
                print(f"删除数据集失败: {exc}")

        # NOTE(review): the empty-list call is placed at function level, so it
        # runs even when dataset_ids is empty; confirm against the original
        # indentation.
        try:
            self.rag_object.delete_datasets(ids=[])
            print("空列表删除操作完成(未删除任何数据集)")
        except Exception as exc:
            print(f"空列表删除操作失败: {exc}")
def main():
    """Run the dataset-management demo: create the sample datasets, then list them.

    The API key and base URL are taken from the ``RAGFLOW_API_KEY`` and
    ``RAGFLOW_BASE_URL`` environment variables when those are set and
    non-empty; otherwise the literal values below are used. Any exception
    raised by the demo steps is printed rather than propagated.
    """
    import os  # function-scope import keeps this block self-contained

    # NOTE(review): a credential is committed here as the fallback value.
    # Prefer supplying RAGFLOW_API_KEY from the environment.
    default_api_key = "ragflow-ZjMDYyNDQwNjIxNzExZjBhMjgyMDI0Mm"
    default_base_url = "http://localhost:9380"

    # `or` (rather than a .get() default) also covers variables set to "".
    api_key = os.environ.get("RAGFLOW_API_KEY") or default_api_key
    base_url = os.environ.get("RAGFLOW_BASE_URL") or default_base_url

    dataset_manager = DatasetManager(api_key, base_url)
    try:
        dataset_manager.create_dataset_example()
        dataset_manager.list_datasets_example()
    except Exception as e:
        print(f"执行过程中发生错误: {e}")
# Script entry point: run the demo only when this file is executed directly, not when it is imported.
if __name__ == "__main__": main()
|