Spaces:
Runtime error
Runtime error
| """ | |
| Hugging Face Space 应用:在 HF Space 上运行 Weaviate 索引构建 | |
| 默认使用 HF 免费 embedding(sentence-transformers),直接上传到 Weaviate Cloud | |
| """ | |
| import os | |
| import gradio as gr | |
| from pathlib import Path | |
| import threading | |
| import time | |
# Configuration is read from environment variables (HF Space Secrets).
def _env(name: str, default: str = "") -> str:
    """Return environment variable *name* stripped, or *default* if unset/blank."""
    return os.getenv(name, "").strip() or default


OPENAI_API_KEY = _env("OPENAI_API_KEY")
WEAVIATE_URL = _env("WEAVIATE_URL")
WEAVIATE_API_KEY = _env("WEAVIATE_API_KEY")
# A secret that exists but is blank must fall back to the default as well,
# otherwise the collection name would silently become an empty string.
WEAVIATE_COLLECTION = _env("WEAVIATE_COLLECTION", "GenAICourses")
EMBEDDING_PROVIDER = _env("EMBEDDING_PROVIDER", "huggingface").lower()

# Course documents live next to this script (the folder must be uploaded
# to the HF Space).
SCRIPT_DIR = Path(__file__).resolve().parent
COURSES_DIR = SCRIPT_DIR / "GENAI COURSES"

# Global build state shared between the worker thread and the UI callbacks.
build_status = {"running": False, "progress": "", "error": None, "result": None}
def build_index_worker(clear_first: bool, progress_callback=None):
    """Build the Weaviate index; intended to run in a background thread.

    Loads the documents under ``COURSES_DIR``, embeds them with the provider
    selected by ``EMBEDDING_PROVIDER`` and uploads them to the
    ``WEAVIATE_COLLECTION`` collection. Progress, the final result and any
    error are published through the module-level ``build_status`` dict;
    exceptions are caught and recorded there instead of being raised.

    Args:
        clear_first: Delete the existing collection before rebuilding.
        progress_callback: Optional callable invoked with every progress
            message, in addition to updating ``build_status["progress"]``.
            Failures are reported through ``build_status`` only.
    """

    def report(message):
        # Single place that publishes progress to the UI state and the callback.
        build_status["progress"] = message
        if progress_callback is not None:
            progress_callback(message)

    try:
        build_status["running"] = True
        build_status["error"] = None
        report("开始构建索引...")

        # Validate configuration before doing any expensive work.
        if EMBEDDING_PROVIDER == "openai" and not OPENAI_API_KEY:
            raise RuntimeError("使用 OpenAI embedding 时请在 Settings → Secrets 中添加 OPENAI_API_KEY")
        if not WEAVIATE_URL or not WEAVIATE_API_KEY:
            raise RuntimeError("请在 HF Space Settings → Secrets 中添加 WEAVIATE_URL 和 WEAVIATE_API_KEY")

        # The course directory must have been uploaded with the Space.
        if not COURSES_DIR.exists():
            raise FileNotFoundError(
                f"课程目录不存在:{COURSES_DIR}\n"
                "请将 GENAI COURSES 文件夹上传到 Space 的根目录"
            )

        # Heavy dependencies are imported lazily, inside the worker.
        report("加载依赖库...")
        from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings
        from llama_index.core import StorageContext
        from llama_index.vector_stores.weaviate import WeaviateVectorStore
        import weaviate
        from weaviate.classes.init import Auth

        # Select the embedding model.
        report("配置 embedding 模型...")
        if EMBEDDING_PROVIDER == "openai":
            from llama_index.embeddings.openai import OpenAIEmbedding
            Settings.embed_model = OpenAIEmbedding(
                model="text-embedding-3-small",
                api_key=OPENAI_API_KEY,
            )
        else:
            from llama_index.embeddings.huggingface import HuggingFaceEmbedding
            Settings.embed_model = HuggingFaceEmbedding(
                model_name="sentence-transformers/all-MiniLM-L6-v2"
            )

        # Connect to Weaviate Cloud; accept URLs given without a scheme.
        report("连接 Weaviate Cloud...")
        url = WEAVIATE_URL
        if not url.startswith("http"):
            url = "https://" + url
        client = weaviate.connect_to_weaviate_cloud(
            cluster_url=url,
            auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
        )
        try:
            # The readiness check lives inside the try so that a failed
            # check still closes the client instead of leaking it.
            if not client.is_ready():
                raise RuntimeError("Weaviate 连接失败")

            # Optionally drop the old collection (best effort).
            if clear_first:
                report(f"删除旧 collection: {WEAVIATE_COLLECTION}...")
                if hasattr(client.collections, "delete"):
                    try:
                        client.collections.delete(WEAVIATE_COLLECTION)
                    except Exception as exc:
                        # A missing collection is fine; anything else is
                        # surfaced as a warning but does not abort the build.
                        detail = str(exc)
                        if "404" not in detail and "not found" not in detail.lower():
                            report(f"删除旧 collection 时警告: {exc}")
                    else:
                        report("旧 collection 已删除")

            # Load the course documents.
            report(f"读取课程目录: {COURSES_DIR}...")
            reader = SimpleDirectoryReader(
                input_dir=str(COURSES_DIR),
                recursive=True,
                required_exts=[".md", ".pdf", ".txt", ".py", ".ipynb", ".docx"],
            )
            documents = reader.load_data()
            report(f"已加载 {len(documents)} 个文档块")

            # Create the vector store backed by the Weaviate collection.
            report("创建 Weaviate vector store...")
            vector_store = WeaviateVectorStore(
                weaviate_client=client,
                index_name=WEAVIATE_COLLECTION,
            )
            storage_context = StorageContext.from_defaults(vector_store=vector_store)

            # Building the index embeds the documents and uploads them.
            report(
                f"正在 embedding 并上传到 Weaviate (collection={WEAVIATE_COLLECTION})...\n这可能需要几分钟时间,请耐心等待..."
            )
            VectorStoreIndex.from_documents(
                documents,
                storage_context=storage_context,
            )

            # Wait for the batch submission to finish before counting.
            time.sleep(3)

            # Verify by reading back the object count.
            report("验证索引...")
            coll = client.collections.get(WEAVIATE_COLLECTION)
            agg = coll.aggregate.over_all(total_count=True)
            n = agg.total_count
            build_status["result"] = f"✅ 索引构建成功!\n当前 object count = {n}"
            report(build_status["result"])
        finally:
            client.close()
    except Exception as e:
        # Some exceptions stringify to ""; fall back to repr so the stored
        # error is never falsy and the UI still shows the failure.
        message = str(e) or repr(e)
        build_status["error"] = message
        build_status["progress"] = f"❌ 错误: {message}"
    finally:
        build_status["running"] = False
# Serialises the "is a build running?" check with the state reset so that
# concurrent button clicks cannot both start a worker.
_build_start_lock = threading.Lock()


def start_build(clear_first: bool):
    """Start the index build in a background thread.

    Args:
        clear_first: Forwarded to ``build_index_worker``; when true the old
            collection is deleted before rebuilding.

    Returns:
        A status message for the UI: a warning if a build is already in
        progress, otherwise a confirmation that the build was started.
    """
    global build_status
    with _build_start_lock:
        if build_status["running"]:
            return "⚠️ 索引构建正在进行中,请等待完成..."
        # Reset the state and mark it as running *before* the thread starts;
        # previously the flag was only set inside the worker, leaving a
        # window in which a second click could launch a second build.
        build_status = {"running": True, "progress": "", "error": None, "result": None}

    worker = threading.Thread(
        target=build_index_worker,
        args=(clear_first,),
        daemon=True,
    )
    try:
        worker.start()
    except RuntimeError:
        # The thread could not be started: do not leave the flag stuck.
        build_status["running"] = False
        raise
    return "🚀 索引构建已启动,请查看下方进度..."
def get_progress():
    """Return the text to display in the progress box for the current state.

    Precedence: live progress while a build is running, then the last
    error, then the last result, otherwise the idle placeholder.
    """
    if not build_status["running"]:
        failure = build_status["error"]
        if failure:
            return f"❌ 错误: {failure}"
        outcome = build_status["result"]
        return outcome if outcome else "等待开始..."

    current = build_status["progress"]
    return current if current else "处理中..."
# Gradio UI
with gr.Blocks(title="Weaviate 索引构建工具") as app:
    # The Markdown text is kept flush-left so it renders the same whether or
    # not the installed Gradio version dedents the value.
    gr.Markdown("""
# 🔍 Weaviate 索引构建工具
在 Hugging Face Space 上使用 **免费** Hugging Face embedding,并直接上传到 Weaviate Cloud。
## 配置要求
请在 **Settings → Secrets** 中添加以下环境变量:
- `WEAVIATE_URL`: Weaviate Cloud REST 地址
- `WEAVIATE_API_KEY`: Weaviate API Key
- `WEAVIATE_COLLECTION`: Collection 名称(默认: GenAICourses)
- `EMBEDDING_PROVIDER`: huggingface(免费,默认)或 openai(需 OPENAI_API_KEY)
## 使用步骤
1. 确保已将 `GENAI COURSES` 文件夹上传到 Space 根目录
2. 点击下方按钮开始构建索引
3. 等待构建完成(可能需要几分钟)
""")

    with gr.Row():
        clear_first = gr.Checkbox(
            label="清空旧索引后重建",
            value=True,
            info="如果勾选,会先删除旧的 collection 再重建",
        )

    build_btn = gr.Button("🚀 开始构建索引", variant="primary", size="lg")

    progress_output = gr.Textbox(
        label="构建进度",
        lines=10,
        interactive=False,
        value="等待开始...",
    )

    # Show the current state when the page is first loaded.
    app.load(
        fn=get_progress,
        inputs=[],
        outputs=progress_output,
    )

    build_btn.click(
        fn=start_build,
        inputs=[clear_first],
        outputs=progress_output,
    )

    # The build runs in a background thread, so the textbox must be refreshed
    # after the click handler has returned; without this the progress, the
    # result and any error would never be displayed.
    if hasattr(gr, "Timer"):
        progress_timer = gr.Timer(2)
        progress_timer.tick(
            fn=get_progress,
            inputs=[],
            outputs=progress_output,
        )
    else:
        # Older Gradio without gr.Timer: offer a manual refresh instead.
        refresh_btn = gr.Button("🔄 刷新进度")
        refresh_btn.click(
            fn=get_progress,
            inputs=[],
            outputs=progress_output,
        )

if __name__ == "__main__":
    app.launch()