""" Hugging Face Space 应用:在 HF Space 上运行 Weaviate 索引构建 默认使用 HF 免费 embedding(sentence-transformers),直接上传到 Weaviate Cloud """ import os import gradio as gr from pathlib import Path import threading import time # 从环境变量读取配置(HF Space Secrets) OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "").strip() WEAVIATE_URL = os.getenv("WEAVIATE_URL", "").strip() WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY", "").strip() WEAVIATE_COLLECTION = os.getenv("WEAVIATE_COLLECTION", "GenAICourses").strip() EMBEDDING_PROVIDER = os.getenv("EMBEDDING_PROVIDER", "huggingface").strip().lower() # 课程文档路径(需要上传到 HF Space) SCRIPT_DIR = Path(__file__).resolve().parent COURSES_DIR = SCRIPT_DIR / "GENAI COURSES" # 全局状态 build_status = {"running": False, "progress": "", "error": None, "result": None} def build_index_worker(clear_first: bool, progress_callback=None): """后台工作线程:构建索引""" global build_status try: build_status["running"] = True build_status["error"] = None build_status["progress"] = "开始构建索引..." # 检查配置 if EMBEDDING_PROVIDER == "openai" and not OPENAI_API_KEY: raise RuntimeError("使用 OpenAI embedding 时请在 Settings → Secrets 中添加 OPENAI_API_KEY") if not WEAVIATE_URL or not WEAVIATE_API_KEY: raise RuntimeError("请在 HF Space Settings → Secrets 中添加 WEAVIATE_URL 和 WEAVIATE_API_KEY") # 检查课程目录 if not COURSES_DIR.exists(): raise FileNotFoundError( f"课程目录不存在:{COURSES_DIR}\n" "请将 GENAI COURSES 文件夹上传到 Space 的根目录" ) # 导入依赖 build_status["progress"] = "加载依赖库..." from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings from llama_index.core import StorageContext from llama_index.vector_stores.weaviate import WeaviateVectorStore import weaviate from weaviate.classes.init import Auth # 设置 embedding build_status["progress"] = "配置 embedding 模型..." if EMBEDDING_PROVIDER == "openai": from llama_index.embeddings.openai import OpenAIEmbedding Settings.embed_model = OpenAIEmbedding( model="text-embedding-3-small", api_key=OPENAI_API_KEY, ) else: from llama_index.embeddings.huggingface import HuggingFaceEmbedding Settings.embed_model = HuggingFaceEmbedding( model_name="sentence-transformers/all-MiniLM-L6-v2" ) # 连接 Weaviate build_status["progress"] = "连接 Weaviate Cloud..." url = WEAVIATE_URL if not url.startswith("http"): url = "https://" + url client = weaviate.connect_to_weaviate_cloud( cluster_url=url, auth_credentials=Auth.api_key(WEAVIATE_API_KEY), ) if not client.is_ready(): raise RuntimeError("Weaviate 连接失败") try: # 清空旧 collection(如果需要) if clear_first: build_status["progress"] = f"删除旧 collection: {WEAVIATE_COLLECTION}..." try: if hasattr(client.collections, "delete"): client.collections.delete(WEAVIATE_COLLECTION) build_status["progress"] = "旧 collection 已删除" except Exception as e: if "404" not in str(e) and "not found" not in str(e).lower(): build_status["progress"] = f"删除旧 collection 时警告: {e}" # 读取文档 build_status["progress"] = f"读取课程目录: {COURSES_DIR}..." reader = SimpleDirectoryReader( input_dir=str(COURSES_DIR), recursive=True, required_exts=[".md", ".pdf", ".txt", ".py", ".ipynb", ".docx"], ) documents = reader.load_data() build_status["progress"] = f"已加载 {len(documents)} 个文档块" # 创建 vector store build_status["progress"] = "创建 Weaviate vector store..." vector_store = WeaviateVectorStore( weaviate_client=client, index_name=WEAVIATE_COLLECTION, ) storage_context = StorageContext.from_defaults(vector_store=vector_store) # 构建索引(这会自动进行 embedding 并上传) build_status["progress"] = f"正在 embedding 并上传到 Weaviate (collection={WEAVIATE_COLLECTION})...\n这可能需要几分钟时间,请耐心等待..." index = VectorStoreIndex.from_documents( documents, storage_context=storage_context, ) # 等待 batch 提交完成 time.sleep(3) # 验证 build_status["progress"] = "验证索引..." coll = client.collections.get(WEAVIATE_COLLECTION) agg = coll.aggregate.over_all(total_count=True) n = agg.total_count build_status["result"] = f"✅ 索引构建成功!\n当前 object count = {n}" build_status["progress"] = build_status["result"] finally: client.close() except Exception as e: build_status["error"] = str(e) build_status["progress"] = f"❌ 错误: {str(e)}" finally: build_status["running"] = False def start_build(clear_first: bool): """启动索引构建""" global build_status if build_status["running"]: return "⚠️ 索引构建正在进行中,请等待完成..." # 重置状态 build_status = {"running": False, "progress": "", "error": None, "result": None} # 启动后台线程 thread = threading.Thread( target=build_index_worker, args=(clear_first,), daemon=True ) thread.start() return "🚀 索引构建已启动,请查看下方进度..." def get_progress(): """获取当前进度""" if build_status["running"]: return build_status["progress"] or "处理中..." elif build_status["error"]: return f"❌ 错误: {build_status['error']}" elif build_status["result"]: return build_status["result"] else: return "等待开始..." # Gradio 界面 with gr.Blocks(title="Weaviate 索引构建工具") as app: gr.Markdown(""" # 🔍 Weaviate 索引构建工具 在 Hugging Face Space 上使用 **免费** Hugging Face embedding,并直接上传到 Weaviate Cloud。 ## 配置要求 请在 **Settings → Secrets** 中添加以下环境变量: - `WEAVIATE_URL`: Weaviate Cloud REST 地址 - `WEAVIATE_API_KEY`: Weaviate API Key - `WEAVIATE_COLLECTION`: Collection 名称(默认: GenAICourses) - `EMBEDDING_PROVIDER`: huggingface(免费,默认)或 openai(需 OPENAI_API_KEY) ## 使用步骤 1. 确保已将 `GENAI COURSES` 文件夹上传到 Space 根目录 2. 点击下方按钮开始构建索引 3. 等待构建完成(可能需要几分钟) """) with gr.Row(): clear_first = gr.Checkbox( label="清空旧索引后重建", value=True, info="如果勾选,会先删除旧的 collection 再重建" ) build_btn = gr.Button("🚀 开始构建索引", variant="primary", size="lg") progress_output = gr.Textbox( label="构建进度", lines=10, interactive=False, value="等待开始..." ) # 初始加载时显示进度 app.load( fn=get_progress, inputs=[], outputs=progress_output, ) build_btn.click( fn=start_build, inputs=[clear_first], outputs=progress_output, ) if __name__ == "__main__": app.launch()