# GenAICoursesDB / app.py — Hugging Face Space (uploaded by claudqunwang)
# Commit e9593ee: Default to HF free embedding (huggingface); require
# OPENAI_API_KEY only when using openai.
"""
Hugging Face Space 应用:在 HF Space 上运行 Weaviate 索引构建
默认使用 HF 免费 embedding(sentence-transformers),直接上传到 Weaviate Cloud
"""
import os
import gradio as gr
from pathlib import Path
import threading
import time
# Configuration read from environment variables (HF Space Secrets).
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "").strip()
WEAVIATE_URL = os.getenv("WEAVIATE_URL", "").strip()
WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY", "").strip()
WEAVIATE_COLLECTION = os.getenv("WEAVIATE_COLLECTION", "GenAICourses").strip()
# Embedding backend: "huggingface" (free, default) or "openai" (needs OPENAI_API_KEY).
EMBEDDING_PROVIDER = os.getenv("EMBEDDING_PROVIDER", "huggingface").strip().lower()
# Path to the course documents (the folder must be uploaded to the Space root).
SCRIPT_DIR = Path(__file__).resolve().parent
COURSES_DIR = SCRIPT_DIR / "GENAI COURSES"
# Global build state, polled by the UI and mutated by the worker thread.
build_status = {"running": False, "progress": "", "error": None, "result": None}
def build_index_worker(clear_first: bool, progress_callback=None):
    """Background worker: build the Weaviate index from the course documents.

    Progress, errors and the final result are reported through the
    module-level ``build_status`` dict so the UI can poll them; nothing is
    returned.

    Args:
        clear_first: when True, delete the existing collection before rebuilding.
        progress_callback: reserved for future use; currently ignored.
    """
    global build_status
    try:
        build_status["running"] = True
        build_status["error"] = None
        build_status["progress"] = "开始构建索引..."

        # Validate the configuration coming from HF Space Secrets.
        if EMBEDDING_PROVIDER == "openai" and not OPENAI_API_KEY:
            raise RuntimeError("使用 OpenAI embedding 时请在 Settings → Secrets 中添加 OPENAI_API_KEY")
        if not WEAVIATE_URL or not WEAVIATE_API_KEY:
            raise RuntimeError("请在 HF Space Settings → Secrets 中添加 WEAVIATE_URL 和 WEAVIATE_API_KEY")

        # The course folder must have been uploaded next to this script.
        if not COURSES_DIR.exists():
            raise FileNotFoundError(
                f"课程目录不存在:{COURSES_DIR}\n"
                "请将 GENAI COURSES 文件夹上传到 Space 的根目录"
            )

        # Import the heavyweight dependencies lazily so the UI starts fast
        # and config errors are reported before any model download begins.
        build_status["progress"] = "加载依赖库..."
        from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings
        from llama_index.core import StorageContext
        from llama_index.vector_stores.weaviate import WeaviateVectorStore
        import weaviate
        from weaviate.classes.init import Auth

        # Configure the embedding model (free HF model by default).
        build_status["progress"] = "配置 embedding 模型..."
        if EMBEDDING_PROVIDER == "openai":
            from llama_index.embeddings.openai import OpenAIEmbedding
            Settings.embed_model = OpenAIEmbedding(
                model="text-embedding-3-small",
                api_key=OPENAI_API_KEY,
            )
        else:
            from llama_index.embeddings.huggingface import HuggingFaceEmbedding
            Settings.embed_model = HuggingFaceEmbedding(
                model_name="sentence-transformers/all-MiniLM-L6-v2"
            )

        # Connect to Weaviate Cloud; tolerate a URL given without a scheme.
        build_status["progress"] = "连接 Weaviate Cloud..."
        url = WEAVIATE_URL
        if not url.startswith("http"):
            url = "https://" + url
        client = weaviate.connect_to_weaviate_cloud(
            cluster_url=url,
            auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
        )
        if not client.is_ready():
            raise RuntimeError("Weaviate 连接失败")
        try:
            # Optionally drop the old collection for a clean rebuild.
            if clear_first:
                build_status["progress"] = f"删除旧 collection: {WEAVIATE_COLLECTION}..."
                try:
                    if hasattr(client.collections, "delete"):
                        client.collections.delete(WEAVIATE_COLLECTION)
                        build_status["progress"] = "旧 collection 已删除"
                except Exception as e:
                    # A missing collection is fine; surface anything else as a warning.
                    if "404" not in str(e) and "not found" not in str(e).lower():
                        build_status["progress"] = f"删除旧 collection 时警告: {e}"

            # Load the course documents recursively from the course folder.
            build_status["progress"] = f"读取课程目录: {COURSES_DIR}..."
            reader = SimpleDirectoryReader(
                input_dir=str(COURSES_DIR),
                recursive=True,
                required_exts=[".md", ".pdf", ".txt", ".py", ".ipynb", ".docx"],
            )
            documents = reader.load_data()
            build_status["progress"] = f"已加载 {len(documents)} 个文档块"

            # Point llama-index at the target Weaviate collection.
            build_status["progress"] = "创建 Weaviate vector store..."
            vector_store = WeaviateVectorStore(
                weaviate_client=client,
                index_name=WEAVIATE_COLLECTION,
            )
            storage_context = StorageContext.from_defaults(vector_store=vector_store)

            # Build the index: this embeds every document and uploads it.
            # The returned index object is not used afterwards, so it is not bound.
            build_status["progress"] = f"正在 embedding 并上传到 Weaviate (collection={WEAVIATE_COLLECTION})...\n这可能需要几分钟时间,请耐心等待..."
            VectorStoreIndex.from_documents(
                documents,
                storage_context=storage_context,
            )
            # Give the client-side batcher a moment to flush before counting.
            time.sleep(3)

            # Verify the upload by counting objects in the collection.
            build_status["progress"] = "验证索引..."
            coll = client.collections.get(WEAVIATE_COLLECTION)
            agg = coll.aggregate.over_all(total_count=True)
            n = agg.total_count
            build_status["result"] = f"✅ 索引构建成功!\n当前 object count = {n}"
            build_status["progress"] = build_status["result"]
        finally:
            # Always release the Weaviate connection once it has been opened.
            client.close()
    except Exception as e:
        # Report any failure through the shared status dict instead of crashing
        # the daemon thread silently.
        build_status["error"] = str(e)
        build_status["progress"] = f"❌ 错误: {str(e)}"
    finally:
        build_status["running"] = False
def start_build(clear_first: bool):
    """Kick off an index build on a background daemon thread.

    Args:
        clear_first: forwarded to the worker; delete the old collection first.

    Returns:
        A short status string for the UI. Refuses to start a second build
        while one is already running.
    """
    global build_status
    if build_status["running"]:
        return "⚠️ 索引构建正在进行中,请等待完成..."
    # Reset state and mark the build as running *before* the thread starts,
    # so a rapid double-click cannot pass the check above twice and launch
    # two concurrent builds (the worker only sets the flag after the thread
    # has actually been scheduled).
    build_status = {"running": True, "progress": "", "error": None, "result": None}
    thread = threading.Thread(
        target=build_index_worker,
        args=(clear_first,),
        daemon=True,
    )
    thread.start()
    return "🚀 索引构建已启动,请查看下方进度..."
def get_progress():
    """Return a human-readable snapshot of the current build state.

    Priority order: a running build shows its progress text, then a recorded
    error, then the final result, and finally a waiting placeholder.
    """
    state = build_status
    if state["running"]:
        return state["progress"] or "处理中..."
    if state["error"]:
        return f"❌ 错误: {state['error']}"
    if state["result"]:
        return state["result"]
    return "等待开始..."
# Gradio UI: one button to start the build, one textbox polled for progress.
with gr.Blocks(title="Weaviate 索引构建工具") as app:
    gr.Markdown("""
# 🔍 Weaviate 索引构建工具
在 Hugging Face Space 上使用 **免费** Hugging Face embedding,并直接上传到 Weaviate Cloud。
## 配置要求
请在 **Settings → Secrets** 中添加以下环境变量:
- `WEAVIATE_URL`: Weaviate Cloud REST 地址
- `WEAVIATE_API_KEY`: Weaviate API Key
- `WEAVIATE_COLLECTION`: Collection 名称(默认: GenAICourses)
- `EMBEDDING_PROVIDER`: huggingface(免费,默认)或 openai(需 OPENAI_API_KEY)
## 使用步骤
1. 确保已将 `GENAI COURSES` 文件夹上传到 Space 根目录
2. 点击下方按钮开始构建索引
3. 等待构建完成(可能需要几分钟)
""")
    with gr.Row():
        clear_first = gr.Checkbox(
            label="清空旧索引后重建",
            value=True,
            info="如果勾选,会先删除旧的 collection 再重建"
        )
    # NOTE(review): the original indentation was lost; the button is placed
    # outside the row here — confirm the intended layout.
    build_btn = gr.Button("🚀 开始构建索引", variant="primary", size="lg")
    progress_output = gr.Textbox(
        label="构建进度",
        lines=10,
        interactive=False,
        value="等待开始..."
    )
    # Show the current build progress when the page first loads.
    app.load(
        fn=get_progress,
        inputs=[],
        outputs=progress_output,
    )
    # Clicking the button starts the build and shows the launch message.
    build_btn.click(
        fn=start_build,
        inputs=[clear_first],
        outputs=progress_output,
    )

# Script entry point: launch the Gradio app.
if __name__ == "__main__":
    app.launch()