Spaces:
Running
Running
GitHub Actions Bot
committed on
Commit
·
1ea875f
0
Parent(s):
deploy: auto-inject hf config & sync
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- .env.example +93 -0
- .github/workflows/sync_to_hub.yml +58 -0
- .gitignore +43 -0
- Dockerfile +45 -0
- LICENSE +21 -0
- README.md +224 -0
- README_zh.md +212 -0
- app/core/config.py +246 -0
- app/main.py +560 -0
- app/services/agent_service.py +779 -0
- app/services/auto_evaluation_service.py +481 -0
- app/services/chat_service.py +601 -0
- app/services/chunking_service.py +372 -0
- app/services/github_service.py +210 -0
- app/services/tracing_service.py +549 -0
- app/services/vector_service.py +676 -0
- app/storage/__init__.py +34 -0
- app/storage/base.py +159 -0
- app/storage/qdrant_store.py +578 -0
- app/utils/embedding.py +254 -0
- app/utils/github_client.py +478 -0
- app/utils/llm_client.py +108 -0
- app/utils/llm_providers/__init__.py +29 -0
- app/utils/llm_providers/anthropic_provider.py +196 -0
- app/utils/llm_providers/base.py +320 -0
- app/utils/llm_providers/deepseek_provider.py +154 -0
- app/utils/llm_providers/factory.py +171 -0
- app/utils/llm_providers/gemini_provider.py +301 -0
- app/utils/llm_providers/openai_provider.py +145 -0
- app/utils/repo_lock.py +390 -0
- app/utils/retry.py +198 -0
- app/utils/session.py +230 -0
- deploy.sh +143 -0
- docker-compose.yml +102 -0
- evaluation/__init__.py +64 -0
- evaluation/analyze_eval_results.py +379 -0
- evaluation/clean_and_export_sft_data.py +369 -0
- evaluation/data_router.py +222 -0
- evaluation/evaluation_framework.py +512 -0
- evaluation/golden_dataset_builder.py +414 -0
- evaluation/models.py +244 -0
- evaluation/test_retrieval.py +330 -0
- evaluation/utils.py +196 -0
- frontend-dist/assets/Tableau10-B-NsZVaP.js +1 -0
- frontend-dist/assets/arc-BscbqCCW.js +1 -0
- frontend-dist/assets/array-BKyUJesY.js +1 -0
- frontend-dist/assets/blockDiagram-c4efeb88-CL85BYG9.js +118 -0
- frontend-dist/assets/c4Diagram-c83219d4-Dwk4T9_E.js +10 -0
- frontend-dist/assets/channel-DsKT-zfZ.js +1 -0
- frontend-dist/assets/classDiagram-beda092f-wmkRqnN2.js +2 -0
.env.example
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ======================================
|
| 2 |
+
# GitHub Agent Demo - 环境变量配置
|
| 3 |
+
# ======================================
|
| 4 |
+
|
| 5 |
+
# --- LLM 供应商选择 ---
|
| 6 |
+
# 支持: openai, deepseek, anthropic, gemini
|
| 7 |
+
# 默认: deepseek
|
| 8 |
+
LLM_PROVIDER=deepseek
|
| 9 |
+
|
| 10 |
+
# --- API Keys (根据选择的供应商配置对应的 Key) ---
|
| 11 |
+
|
| 12 |
+
# OpenAI (如果 LLM_PROVIDER=openai)
|
| 13 |
+
OPENAI_API_KEY=
|
| 14 |
+
# OPENAI_BASE_URL= # 可选: 自定义端点 (如 Azure OpenAI)
|
| 15 |
+
|
| 16 |
+
# DeepSeek (如果 LLM_PROVIDER=deepseek)
|
| 17 |
+
DEEPSEEK_API_KEY=
|
| 18 |
+
# DEEPSEEK_BASE_URL=https://api.deepseek.com # 可选: 默认值
|
| 19 |
+
|
| 20 |
+
# Anthropic Claude (如果 LLM_PROVIDER=anthropic)
|
| 21 |
+
ANTHROPIC_API_KEY=
|
| 22 |
+
|
| 23 |
+
# Google Gemini (如果 LLM_PROVIDER=gemini)
|
| 24 |
+
GEMINI_API_KEY=
|
| 25 |
+
# GEMINI_BASE_URL= # 可选: OpenAI 兼容端点
|
| 26 |
+
|
| 27 |
+
# --- 模型配置 ---
|
| 28 |
+
# 如果不指定,将使用各供应商的默认模型:
|
| 29 |
+
# - openai: gpt-4o-mini
|
| 30 |
+
# - deepseek: deepseek-chat
|
| 31 |
+
# - anthropic: claude-3-5-sonnet-20241022
|
| 32 |
+
# - gemini: gemini-1.5-flash
|
| 33 |
+
# MODEL_NAME=deepseek-chat
|
| 34 |
+
|
| 35 |
+
# --- GitHub Token ---
|
| 36 |
+
# 用于访问 GitHub API,提高请求限制
|
| 37 |
+
GITHUB_TOKEN=
|
| 38 |
+
|
| 39 |
+
# --- Embedding 服务 ---
|
| 40 |
+
# SiliconFlow API Key (用于 BGE-M3 Embedding)
|
| 41 |
+
SILICON_API_KEY=
|
| 42 |
+
|
| 43 |
+
# --- Langfuse 追踪配置 (可选) ---
|
| 44 |
+
# LANGFUSE_ENABLED=true
|
| 45 |
+
# LANGFUSE_HOST=http://localhost:3000
|
| 46 |
+
# LANGFUSE_PUBLIC_KEY=
|
| 47 |
+
# LANGFUSE_SECRET_KEY=
|
| 48 |
+
|
| 49 |
+
# --- Qdrant 向量数据库配置 ---
|
| 50 |
+
# 模式选择: "local" | "server" | "cloud"
|
| 51 |
+
# - local: 本地嵌入式存储 (开发环境, 单 Worker)
|
| 52 |
+
# - server: Qdrant Server Docker (生产环境, 多 Worker)
|
| 53 |
+
# - cloud: Qdrant Cloud 托管服务
|
| 54 |
+
QDRANT_MODE=local
|
| 55 |
+
QDRANT_LOCAL_PATH=data/qdrant_db
|
| 56 |
+
|
| 57 |
+
# Server 模式: 连接 Qdrant Server (Docker)
|
| 58 |
+
# QDRANT_MODE=server
|
| 59 |
+
# QDRANT_URL=http://localhost:6333
|
| 60 |
+
# 或分开配置:
|
| 61 |
+
# QDRANT_HOST=localhost
|
| 62 |
+
# QDRANT_PORT=6333
|
| 63 |
+
|
| 64 |
+
# Cloud 模式: 连接 Qdrant Cloud
|
| 65 |
+
# QDRANT_MODE=cloud
|
| 66 |
+
# QDRANT_URL=https://xxx.qdrant.tech
|
| 67 |
+
# QDRANT_API_KEY=your-api-key
|
| 68 |
+
|
| 69 |
+
# 向量维度 (BGE-M3 = 1024)
|
| 70 |
+
# QDRANT_VECTOR_SIZE=1024
|
| 71 |
+
|
| 72 |
+
# --- Gunicorn Worker 配置 ---
|
| 73 |
+
# 2核2G服务器建议设为 2
|
| 74 |
+
# 4核8G服务器可设为 4
|
| 75 |
+
GUNICORN_WORKERS=2
|
| 76 |
+
|
| 77 |
+
# --- 分布式锁配置 ---
|
| 78 |
+
# 锁后端: "memory" | "file" | "redis"
|
| 79 |
+
# - memory: 内存锁 (单进程)
|
| 80 |
+
# - file: 文件锁 (多 Worker 单节点)
|
| 81 |
+
# - redis: Redis 分布式锁 (多节点)
|
| 82 |
+
LOCK_BACKEND=file
|
| 83 |
+
LOCK_DIR=data/locks
|
| 84 |
+
# REDIS_URL=redis://localhost:6379/0
|
| 85 |
+
|
| 86 |
+
# --- 服务配置 ---
|
| 87 |
+
HOST=0.0.0.0
|
| 88 |
+
PORT=8000
|
| 89 |
+
|
| 90 |
+
# --- LLM 参数 (可选) ---
|
| 91 |
+
# LLM_TEMPERATURE=0.1
|
| 92 |
+
# LLM_MAX_TOKENS=4096
|
| 93 |
+
# LLM_TIMEOUT=600
|
.github/workflows/sync_to_hub.yml
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Sync to Hugging Face hub
|
| 2 |
+
on:
|
| 3 |
+
push:
|
| 4 |
+
branches: [main]
|
| 5 |
+
workflow_dispatch:
|
| 6 |
+
|
| 7 |
+
jobs:
|
| 8 |
+
sync-to-hub:
|
| 9 |
+
runs-on: ubuntu-latest
|
| 10 |
+
steps:
|
| 11 |
+
- uses: actions/checkout@v3
|
| 12 |
+
with:
|
| 13 |
+
fetch-depth: 0
|
| 14 |
+
|
| 15 |
+
- name: Push to hub
|
| 16 |
+
env:
|
| 17 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 18 |
+
HF_USERNAME: realdexter
|
| 19 |
+
SPACE_NAME: RepoReaper
|
| 20 |
+
run: |
|
| 21 |
+
echo "🚀 Starting deployment to Hugging Face..."
|
| 22 |
+
|
| 23 |
+
# 1. 配置 Git
|
| 24 |
+
git config --global user.email "bot@github.com"
|
| 25 |
+
git config --global user.name "GitHub Actions Bot"
|
| 26 |
+
|
| 27 |
+
# 2. 【核心魔法】动态生成 Hugging Face 专用的 README
|
| 28 |
+
# 这一步会在发送给 HF 之前,强行在 README.md 顶部插入配置头
|
| 29 |
+
# GitHub 本地的文件不会受影响,依然保持干净漂亮
|
| 30 |
+
echo "---" > hf_header.yml
|
| 31 |
+
echo "title: RepoReaper" >> hf_header.yml
|
| 32 |
+
echo "emoji: 💀" >> hf_header.yml
|
| 33 |
+
echo "colorFrom: blue" >> hf_header.yml
|
| 34 |
+
echo "colorTo: indigo" >> hf_header.yml
|
| 35 |
+
echo "sdk: docker" >> hf_header.yml
|
| 36 |
+
echo "pinned: false" >> hf_header.yml
|
| 37 |
+
echo "app_port: 8000" >> hf_header.yml # 👈 关键:这里指定端口,你就不用改代码了
|
| 38 |
+
echo "---" >> hf_header.yml
|
| 39 |
+
echo "" >> hf_header.yml
|
| 40 |
+
|
| 41 |
+
# 将配置头和原 README 内容拼接
|
| 42 |
+
cat hf_header.yml README.md > README_temp.md
|
| 43 |
+
mv README_temp.md README.md
|
| 44 |
+
|
| 45 |
+
# 3. 清理不需要的文件
|
| 46 |
+
rm -rf docs/
|
| 47 |
+
rm -f *.jpg *.png *.gif hf_header.yml
|
| 48 |
+
rm -rf .git
|
| 49 |
+
|
| 50 |
+
# 4. 初始化新仓库并推送
|
| 51 |
+
git init -b main
|
| 52 |
+
git add .
|
| 53 |
+
git commit -m "deploy: auto-inject hf config & sync"
|
| 54 |
+
|
| 55 |
+
git remote add space https://$HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/$HF_USERNAME/$SPACE_NAME
|
| 56 |
+
git push --force space main
|
| 57 |
+
|
| 58 |
+
echo "✅ Deployment successful! Config header injected on-the-fly."
|
.gitignore
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# .gitignore
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
.env
|
| 5 |
+
.venv/
|
| 6 |
+
venv/
|
| 7 |
+
.DS_Store
|
| 8 |
+
data/
|
| 9 |
+
# Vue 构建输出
|
| 10 |
+
#frontend-dist/
|
| 11 |
+
frontend-vue/node_modules/
|
| 12 |
+
frontend-vue/dist/
|
| 13 |
+
|
| 14 |
+
# 锁文件目录
|
| 15 |
+
data/locks/
|
| 16 |
+
|
| 17 |
+
# 日志
|
| 18 |
+
logs/
|
| 19 |
+
*.log
|
| 20 |
+
|
| 21 |
+
# IDE
|
| 22 |
+
.idea/
|
| 23 |
+
.vscode/
|
| 24 |
+
*.swp
|
| 25 |
+
|
| 26 |
+
# 临时文件
|
| 27 |
+
*.tmp
|
| 28 |
+
*.bak
|
| 29 |
+
QUICKSTART.md
|
| 30 |
+
docs/INTERVIEW_QA.md
|
| 31 |
+
docs/ROADMAP.md
|
| 32 |
+
docs/TECHNICAL_REPORT.md
|
| 33 |
+
evaluation/000_START_HERE.md
|
| 34 |
+
evaluation/golden_dataset.json
|
| 35 |
+
evaluation/HIGH_QUALITY_QUESTIONS.md
|
| 36 |
+
|
| 37 |
+
evaluation/README_EVALUATION_SYSTEM.md
|
| 38 |
+
evaluation/ragas_eval_dataset.json
|
| 39 |
+
evaluation/sft_data/eval_results.jsonl
|
| 40 |
+
evaluation/sft_data/negative_samples.jsonl
|
| 41 |
+
evaluation/sft_data/positive_samples.jsonl
|
| 42 |
+
evaluation/sft_data/skipped_samples.jsonl
|
| 43 |
+
evaluation/sft_data/cleaned/rejected_20260128_010745.jsonl
|
Dockerfile
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 1. 基础镜像:选择 Python 3.10 的轻量版 (Slim)
|
| 2 |
+
FROM python:3.10-slim
|
| 3 |
+
|
| 4 |
+
# 2. 设置环境变量
|
| 5 |
+
ENV PYTHONDONTWRITEBYTECODE=1 \
|
| 6 |
+
PYTHONUNBUFFERED=1 \
|
| 7 |
+
# 默认 LLM 供应商 (可通过 docker run -e 覆盖)
|
| 8 |
+
LLM_PROVIDER=deepseek
|
| 9 |
+
|
| 10 |
+
# 3. 设置工作目录
|
| 11 |
+
WORKDIR /app
|
| 12 |
+
|
| 13 |
+
# 4. 安装系统级依赖
|
| 14 |
+
# build-essential: ChromaDB 编译需要
|
| 15 |
+
# curl: 健康检查
|
| 16 |
+
# git: 某些 pip 包可能需要
|
| 17 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 18 |
+
build-essential \
|
| 19 |
+
curl \
|
| 20 |
+
git \
|
| 21 |
+
&& rm -rf /var/lib/apt/lists/* \
|
| 22 |
+
&& apt-get clean
|
| 23 |
+
|
| 24 |
+
# 5. 复制依赖文件并安装 (利用 Docker 层缓存)
|
| 25 |
+
COPY requirements.txt .
|
| 26 |
+
|
| 27 |
+
# 6. 安装 Python 依赖
|
| 28 |
+
RUN pip install --no-cache-dir --upgrade pip && \
|
| 29 |
+
pip install --no-cache-dir -r requirements.txt
|
| 30 |
+
|
| 31 |
+
# 7. 复制项目代码
|
| 32 |
+
COPY . .
|
| 33 |
+
|
| 34 |
+
# 8. 创建数据目录 (Qdrant 本地存储 + 上下文缓存)
|
| 35 |
+
RUN mkdir -p /app/data/qdrant_db /app/data/contexts
|
| 36 |
+
|
| 37 |
+
# 9. 暴露端口
|
| 38 |
+
EXPOSE 8000
|
| 39 |
+
|
| 40 |
+
# 10. 健康检查
|
| 41 |
+
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
| 42 |
+
CMD curl -f http://localhost:8000/health || exit 1
|
| 43 |
+
|
| 44 |
+
# 11. 启动命令
|
| 45 |
+
CMD ["gunicorn", "-c", "gunicorn_conf.py", "app.main:app"]
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2025 tzzp1224
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: RepoReaper
|
| 3 |
+
emoji: 💀
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
app_port: 8000
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
<div align="center">
|
| 12 |
+
|
| 13 |
+
<img src="./docs/logo.jpg" width="800" style="max-width: 100%;" height="auto" alt="RepoReaper Logo">
|
| 14 |
+
|
| 15 |
+
<h1>RepoReaper</h1>
|
| 16 |
+
|
| 17 |
+
<h3>💀 Harvest Logic. Dissect Architecture. Chat with Code.</h3>
|
| 18 |
+
|
| 19 |
+
<p>
|
| 20 |
+
<a href="./README.md">English</a> •
|
| 21 |
+
<a href="./README_zh.md">简体中文</a>
|
| 22 |
+
</p>
|
| 23 |
+
|
| 24 |
+
<a href="./LICENSE">
|
| 25 |
+
<img src="https://img.shields.io/github/license/tzzp1224/RepoReaper?style=flat-square&color=blue" alt="License">
|
| 26 |
+
</a>
|
| 27 |
+
<img src="https://img.shields.io/badge/Python-3.10+-3776AB?style=flat-square&logo=python&logoColor=white" alt="Python Version">
|
| 28 |
+
<img src="https://img.shields.io/badge/Model-DeepSeek_V3-673AB7?style=flat-square&logo=openai&logoColor=white" alt="DeepSeek Powered">
|
| 29 |
+
<img src="https://img.shields.io/badge/Agent-ReAct-orange?style=flat-square" alt="Agent Architecture">
|
| 30 |
+
|
| 31 |
+
<br>
|
| 32 |
+
|
| 33 |
+
<img src="https://img.shields.io/badge/RAG-Hybrid_Search-009688?style=flat-square" alt="RAG">
|
| 34 |
+
<img src="https://img.shields.io/badge/VectorDB-Qdrant-important?style=flat-square" alt="Qdrant">
|
| 35 |
+
<img src="https://img.shields.io/badge/Framework-FastAPI-005571?style=flat-square&logo=fastapi&logoColor=white" alt="FastAPI">
|
| 36 |
+
<img src="https://img.shields.io/badge/Frontend-Vue_3-4FC08D?style=flat-square&logo=vue.js&logoColor=white" alt="Vue 3">
|
| 37 |
+
<img src="https://img.shields.io/badge/Docker-Ready-2496ED?style=flat-square&logo=docker&logoColor=white" alt="Docker">
|
| 38 |
+
|
| 39 |
+
<br>
|
| 40 |
+
<br>
|
| 41 |
+
|
| 42 |
+
<p>
|
| 43 |
+
<b>👇 Live Demo / 在线体验 👇</b>
|
| 44 |
+
</p>
|
| 45 |
+
<p align="center">
|
| 46 |
+
<a href="https://realdexter-reporeaper.hf.space" target="_blank" rel="noopener noreferrer">
|
| 47 |
+
<img src="https://img.shields.io/badge/🤗%20Hugging%20Face-Global%20Demo-ffd21e?style=for-the-badge&logo=huggingface&logoColor=black" alt="Global Demo" height="45">
|
| 48 |
+
</a>
|
| 49 |
+
|
| 50 |
+
<a href="https://repo.realdexter.com/" target="_blank" rel="noopener noreferrer">
|
| 51 |
+
<img src="https://img.shields.io/badge/🚀%20Seoul%20Server-CN%20Optimized-red?style=for-the-badge&logo=rocket&logoColor=white" alt="China Demo" height="45">
|
| 52 |
+
</a>
|
| 53 |
+
</p>
|
| 54 |
+
|
| 55 |
+
<p align="center">
|
| 56 |
+
<small>
|
| 57 |
+
⚠️ Public demos use shared API quotas. Deploy locally for the best experience.
|
| 58 |
+
</small>
|
| 59 |
+
</p>
|
| 60 |
+
|
| 61 |
+
<br>
|
| 62 |
+
|
| 63 |
+
<img src="./docs/demo_preview.gif" width="800" style="max-width: 100%; box-shadow: 0 4px 8px rgba(0,0,0,0.1); border-radius: 8px;" alt="RepoReaper Demo">
|
| 64 |
+
|
| 65 |
+
<br>
|
| 66 |
+
</div>
|
| 67 |
+
|
| 68 |
+
---
|
| 69 |
+
|
| 70 |
+
An autonomous Agent that dissects any GitHub repository. It maps code architecture, warms up semantic cache, and answers questions with Just-In-Time context retrieval.
|
| 71 |
+
|
| 72 |
+
---
|
| 73 |
+
|
| 74 |
+
## ✨ Key Features
|
| 75 |
+
|
| 76 |
+
| Feature | Description |
|
| 77 |
+
|:--------|:------------|
|
| 78 |
+
| **Multi-Language AST Parsing** | Python AST + Regex patterns for Java, TypeScript, Go, Rust, etc. |
|
| 79 |
+
| **Hybrid Search** | Qdrant vectors + BM25 with RRF fusion |
|
| 80 |
+
| **JIT Context Loading** | Auto-fetches missing files during Q&A |
|
| 81 |
+
| **Query Rewrite** | Translates natural language to code keywords |
|
| 82 |
+
| **End-to-End Tracing** | Langfuse integration for observability |
|
| 83 |
+
| **Auto Evaluation** | LLM-as-Judge scoring pipeline |
|
| 84 |
+
|
| 85 |
+
---
|
| 86 |
+
|
| 87 |
+
## 🏗 Architecture
|
| 88 |
+
|
| 89 |
+
```
|
| 90 |
+
┌─────────────────────────────────────────────────────────────┐
|
| 91 |
+
│ Vue 3 Frontend (SSE Streaming + Mermaid Diagrams) │
|
| 92 |
+
└─────────────────────┬───────────────────────────────────────┘
|
| 93 |
+
│
|
| 94 |
+
┌─────────────────────▼───────────────────────────────────────┐
|
| 95 |
+
│ FastAPI Backend │
|
| 96 |
+
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────┐ │
|
| 97 |
+
│ │ Agent │ │ Chat │ │ Evaluation │ │
|
| 98 |
+
│ │ Service │ │ Service │ │ Framework │ │
|
| 99 |
+
│ └──────┬──────┘ └──────┬──────┘ └─────────────────────┘ │
|
| 100 |
+
│ │ │ │
|
| 101 |
+
│ ┌──────▼───────────────▼──────┐ ┌─────────────────────┐ │
|
| 102 |
+
│ │ Vector Service (Qdrant+BM25)│ │ Tracing (Langfuse) │ │
|
| 103 |
+
│ └─────────────────────────────┘ └─────────────────────┘ │
|
| 104 |
+
└─────────────────────────────────────────────────────────────┘
|
| 105 |
+
```
|
| 106 |
+
|
| 107 |
+
---
|
| 108 |
+
|
| 109 |
+
## 🛠 Tech Stack
|
| 110 |
+
|
| 111 |
+
**Backend:** Python 3.10+ · FastAPI · AsyncIO · Qdrant · BM25
|
| 112 |
+
**Frontend:** Vue 3 · Pinia · Mermaid.js · SSE
|
| 113 |
+
**LLM:** DeepSeek V3 · SiliconFlow BGE-M3
|
| 114 |
+
**Ops:** Docker · Gunicorn · Langfuse
|
| 115 |
+
|
| 116 |
+
---
|
| 117 |
+
|
| 118 |
+
## 🏁 Quick Start
|
| 119 |
+
|
| 120 |
+
**Prerequisites:** Python 3.10+ · (Optional) Node 18+ for rebuilding frontend · GitHub Token (recommended) · LLM API Key (required)
|
| 121 |
+
|
| 122 |
+
```bash
|
| 123 |
+
# Clone & Setup
|
| 124 |
+
git clone https://github.com/tzzp1224/RepoReaper.git && cd RepoReaper
|
| 125 |
+
python -m venv venv && source venv/bin/activate
|
| 126 |
+
pip install -r requirements.txt
|
| 127 |
+
|
| 128 |
+
# Configure .env (copy from example and fill in your keys)
|
| 129 |
+
cp .env.example .env
|
| 130 |
+
# Required: set LLM_PROVIDER and the matching *_API_KEY
|
| 131 |
+
# Recommended: GITHUB_TOKEN and SILICON_API_KEY (embeddings)
|
| 132 |
+
|
| 133 |
+
# (Optional) Build frontend (repo already contains frontend-dist)
|
| 134 |
+
cd frontend-vue
|
| 135 |
+
npm install
|
| 136 |
+
npm run build
|
| 137 |
+
cd ..
|
| 138 |
+
|
| 139 |
+
# Run
|
| 140 |
+
python -m app.main
|
| 141 |
+
```
|
| 142 |
+
|
| 143 |
+
Open `http://localhost:8000` and paste any GitHub repo URL.
|
| 144 |
+
|
| 145 |
+
**Docker (single container, local Qdrant):**
|
| 146 |
+
```bash
|
| 147 |
+
cp .env.example .env
|
| 148 |
+
docker build -t reporeaper .
|
| 149 |
+
docker run -d -p 8000:8000 --env-file .env reporeaper
|
| 150 |
+
```
|
| 151 |
+
|
| 152 |
+
**Docker Compose (recommended, with Qdrant Server):**
|
| 153 |
+
```bash
|
| 154 |
+
cp .env.example .env
|
| 155 |
+
# Set QDRANT_MODE=server and QDRANT_URL=http://qdrant:6333 in .env
|
| 156 |
+
docker compose up -d --build
|
| 157 |
+
```
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
## 📊 Evaluation & Tracing Status
|
| 164 |
+
|
| 165 |
+
| Component | Status | Notes |
|
| 166 |
+
|:----------|:------:|:------|
|
| 167 |
+
| **Self-built Eval Engine** | ✅ Working | 4-layer metrics (QueryRewrite / Retrieval / Generation / Agentic), LLM-as-Judge |
|
| 168 |
+
| **Auto Evaluation** | ✅ Working | Triggers after every `/chat`, async, writes to `evaluation/sft_data/` |
|
| 169 |
+
| **Data Routing (SFT)** | ✅ Working | Auto-grades Gold/Silver/Bronze/Rejected → JSONL files |
|
| 170 |
+
| **Eval API Endpoints** | ✅ Working | `/evaluate`, `/evaluation/stats`, `/dashboard/*`, `/auto-eval/*` (7 endpoints) |
|
| 171 |
+
| **Offline Retrieval Eval** | ✅ Working | `test_retrieval.py` — Hit Rate, Recall@K, Precision@K, MRR |
|
| 172 |
+
| **Langfuse Tracing** | ⚠️ Partial | Framework + 14 call sites wired in agent/chat services; falls back to local JSON logs (`logs/traces/`) when Langfuse is unavailable |
|
| 173 |
+
| **Ragas Integration** | ❌ Placeholder | `use_ragas=False` by default; `_ragas_eval()` API call doesn't match the latest Ragas SDK |
|
| 174 |
+
| **Langfuse ↔ Eval** | ❌ Not connected | Eval results only write JSONL, not reported to Langfuse Scores API |
|
| 175 |
+
|
| 176 |
+
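> **Overall completion: ~65%** — the self-built eval loop is production-ready; the Ragas integration is a non-functional placeholder, and the Langfuse integration is only partially wired up (tracing falls back to local logs; eval results are not reported to Langfuse).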
|
| 177 |
+
|
| 178 |
+
---
|
| 179 |
+
|
| 180 |
+
## ⚠️ Known Issues
|
| 181 |
+
|
| 182 |
+
1. **Python 3.14 + Langfuse import error**
|
| 183 |
+
`pydantic.v1.errors.ConfigError: unable to infer type for attribute "description"` — Langfuse 3.x internally uses the `pydantic.v1` compat layer, which breaks on Python 3.14.
|
| 184 |
+
**Workaround:** set `LANGFUSE_ENABLED=false` in `.env`, or use Python 3.10–3.12.
|
| 185 |
+
|
| 186 |
+
2. **Langfuse Server not included in `docker-compose.yml`**
|
| 187 |
+
Even if the import works, you need a running Langfuse instance. Add it yourself or use [app.langfuse.com](https://app.langfuse.com).
|
| 188 |
+
|
| 189 |
+
3. **Trace spans are not linked**
|
| 190 |
+
`tracing_service` records spans/events but doesn't pass `trace_id` to Langfuse API calls — the Langfuse UI will show isolated events instead of a connected trace tree.
|
| 191 |
+
|
| 192 |
+
4. **Ragas `_ragas_eval()` uses outdated API**
|
| 193 |
+
Passes a plain dict to `ragas.evaluate()`, but the latest Ragas requires a `Dataset` object. The `ragas_eval_dataset.json` export exists but no script consumes it.
|
| 194 |
+
|
| 195 |
+
5. **Golden dataset has no reference answers**
|
| 196 |
+
All 26 test cases have `expected_answer: ""` — generation quality cannot be compared against ground truth.
|
| 197 |
+
|
| 198 |
+
6. **Heuristic fallback is coarse**
|
| 199 |
+
When no LLM client is available, `faithfulness` uses keyword overlap + 0.2 baseline; `completeness` is purely length-based.
|
| 200 |
+
|
| 201 |
+
---
|
| 202 |
+
|
| 203 |
+
## 🗺 Roadmap
|
| 204 |
+
|
| 205 |
+
- [ ] **Fix Langfuse compat** — pin `langfuse`/`pydantic` versions or gate import behind Python version check
|
| 206 |
+
- [ ] **Add Langfuse to `docker-compose.yml`** — one-command local observability
|
| 207 |
+
- [ ] **Wire trace_id through spans** — enable full trace tree in Langfuse UI
|
| 208 |
+
- [ ] **Integrate Ragas properly** — update `_ragas_eval()` to use `ragas.evaluate(Dataset(...))`, add a standalone eval script
|
| 209 |
+
- [ ] **Enrich golden dataset** — add `expected_answer` for generation benchmarking, expand to 50+ cases
|
| 210 |
+
- [ ] **Eval dashboard frontend** — Vue component to visualize quality distribution and bad cases
|
| 211 |
+
- [ ] **CI regression baseline** — run `test_retrieval.py` in GitHub Actions, fail on metric regression
|
| 212 |
+
- [ ] **Export to Langfuse Datasets** — push eval results to Langfuse Scores/Datasets API for unified observability
|
| 213 |
+
|
| 214 |
+
---
|
| 215 |
+
|
| 216 |
+
## 📈 Star History
|
| 217 |
+
|
| 218 |
+
<a href="https://star-history.com/#tzzp1224/RepoReaper&Date">
|
| 219 |
+
<picture>
|
| 220 |
+
<source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=tzzp1224/RepoReaper&type=Date&theme=dark" />
|
| 221 |
+
<source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=tzzp1224/RepoReaper&type=Date" />
|
| 222 |
+
<img alt="Star History Chart" src="https://api.star-history.com/svg?repos=tzzp1224/RepoReaper&type=Date" />
|
| 223 |
+
</picture>
|
| 224 |
+
</a>
|
README_zh.md
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<div align="center">
|
| 2 |
+
|
| 3 |
+
<img src="./docs/logo.jpg" width="800" style="max-width: 100%;" height="auto" alt="RepoReaper Logo">
|
| 4 |
+
|
| 5 |
+
<h1>RepoReaper</h1>
|
| 6 |
+
|
| 7 |
+
<h3>💀 Harvest Logic. Dissect Architecture. Chat with Code.</h3>
|
| 8 |
+
|
| 9 |
+
<p>
|
| 10 |
+
<a href="./README.md">English</a> •
|
| 11 |
+
<strong>简体中文</strong>
|
| 12 |
+
</p>
|
| 13 |
+
|
| 14 |
+
<a href="./LICENSE">
|
| 15 |
+
<img src="https://img.shields.io/github/license/tzzp1224/RepoReaper?style=flat-square&color=blue" alt="License">
|
| 16 |
+
</a>
|
| 17 |
+
<img src="https://img.shields.io/badge/Python-3.10+-3776AB?style=flat-square&logo=python&logoColor=white" alt="Python Version">
|
| 18 |
+
<img src="https://img.shields.io/badge/Model-DeepSeek_V3-673AB7?style=flat-square&logo=openai&logoColor=white" alt="DeepSeek Powered">
|
| 19 |
+
<img src="https://img.shields.io/badge/Agent-ReAct-orange?style=flat-square" alt="Agent Architecture">
|
| 20 |
+
|
| 21 |
+
<br>
|
| 22 |
+
|
| 23 |
+
<img src="https://img.shields.io/badge/RAG-Hybrid_Search-009688?style=flat-square" alt="RAG">
|
| 24 |
+
<img src="https://img.shields.io/badge/VectorDB-Qdrant-important?style=flat-square" alt="Qdrant">
|
| 25 |
+
<img src="https://img.shields.io/badge/Framework-FastAPI-005571?style=flat-square&logo=fastapi&logoColor=white" alt="FastAPI">
|
| 26 |
+
<img src="https://img.shields.io/badge/Frontend-Vue_3-4FC08D?style=flat-square&logo=vue.js&logoColor=white" alt="Vue 3">
|
| 27 |
+
<img src="https://img.shields.io/badge/Docker-Ready-2496ED?style=flat-square&logo=docker&logoColor=white" alt="Docker">
|
| 28 |
+
|
| 29 |
+
<br>
|
| 30 |
+
<br>
|
| 31 |
+
|
| 32 |
+
<p>
|
| 33 |
+
<b>👇 在线体验 👇</b>
|
| 34 |
+
</p>
|
| 35 |
+
<p align="center">
|
| 36 |
+
<a href="https://realdexter-reporeaper.hf.space" target="_blank" rel="noopener noreferrer">
|
| 37 |
+
<img src="https://img.shields.io/badge/🤗%20Hugging%20Face-Global%20Demo-ffd21e?style=for-the-badge&logo=huggingface&logoColor=black" alt="Global Demo" height="45">
|
| 38 |
+
</a>
|
| 39 |
+
|
| 40 |
+
<a href="https://repo.realdexter.com/" target="_blank" rel="noopener noreferrer">
|
| 41 |
+
<img src="https://img.shields.io/badge/🚀%20Seoul%20Server-国内优化-red?style=for-the-badge&logo=rocket&logoColor=white" alt="China Demo" height="45">
|
| 42 |
+
</a>
|
| 43 |
+
</p>
|
| 44 |
+
|
| 45 |
+
<p align="center">
|
| 46 |
+
<small>
|
| 47 |
+
⚠️ 中国用户请使用 Seoul Server。如遇限流,建议本地部署。
|
| 48 |
+
</small>
|
| 49 |
+
</p>
|
| 50 |
+
|
| 51 |
+
<br>
|
| 52 |
+
|
| 53 |
+
<img src="./docs/demo_preview.gif" width="800" style="max-width: 100%; box-shadow: 0 4px 8px rgba(0,0,0,0.1); border-radius: 8px;" alt="RepoReaper Demo">
|
| 54 |
+
|
| 55 |
+
<br>
|
| 56 |
+
</div>
|
| 57 |
+
|
| 58 |
+
---
|
| 59 |
+
|
| 60 |
+
自治型代码审计 Agent:解析任意 GitHub 仓库架构,构建语义缓存,支持即时上下文检索问答。
|
| 61 |
+
|
| 62 |
+
---
|
| 63 |
+
|
| 64 |
+
## ✨ 核心特性
|
| 65 |
+
|
| 66 |
+
| 特性 | 说明 |
|
| 67 |
+
|:----|:----|
|
| 68 |
+
| **多语言 AST 解析** | Python AST + 正则适配 Java / TS / Go / Rust 等 |
|
| 69 |
+
| **混合检索** | Qdrant 向量 + BM25 关键词,RRF 融合排序 |
|
| 70 |
+
| **JIT 动态加载** | 问答时自动拉取缺失文件 |
|
| 71 |
+
| **查询重写** | 自然语言 → 代码检索关键词 |
|
| 72 |
+
| **端到端追踪** | Langfuse 集成,全链路可观测 |
|
| 73 |
+
| **自动评估** | LLM-as-Judge 质量评分 |
|
| 74 |
+
|
| 75 |
+
---
|
| 76 |
+
|
| 77 |
+
## 🏗 系统架构
|
| 78 |
+
|
| 79 |
+
```
|
| 80 |
+
┌─────────────────────────────────────────────────────────────┐
|
| 81 |
+
│ Vue 3 前端 (SSE 流式 + Mermaid 架构图) │
|
| 82 |
+
└─────────────────────┬───────────────────────────────────────┘
|
| 83 |
+
│
|
| 84 |
+
┌─────────────────────▼───────────────────────────────────────┐
|
| 85 |
+
│ FastAPI 后端 │
|
| 86 |
+
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────┐ │
|
| 87 |
+
│ │ Agent │ │ Chat │ │ Evaluation │ │
|
| 88 |
+
│ │ Service │ │ Service │ │ Framework │ │
|
| 89 |
+
│ └──────┬──────┘ └──────┬──────┘ └─────────────────────┘ │
|
| 90 |
+
│ │ │ │
|
| 91 |
+
│ ┌──────▼───────────────▼──────┐ ┌─────────────────────┐ │
|
| 92 |
+
│ │ Vector Service (Qdrant+BM25)│ │ Tracing (Langfuse) │ │
|
| 93 |
+
│ └─────────────────────────────┘ └─────────────────────┘ │
|
| 94 |
+
└─────────────────────────────────────────────────────────────┘
|
| 95 |
+
```
|
| 96 |
+
|
| 97 |
+
---
|
| 98 |
+
|
| 99 |
+
## 🛠 技术栈
|
| 100 |
+
|
| 101 |
+
**后端:** Python 3.10+ · FastAPI · AsyncIO · Qdrant · BM25
|
| 102 |
+
**前端:** Vue 3 · Pinia · Mermaid.js · SSE
|
| 103 |
+
**模型:** DeepSeek V3 · SiliconFlow BGE-M3
|
| 104 |
+
**运维:** Docker · Gunicorn · Langfuse
|
| 105 |
+
|
| 106 |
+
---
|
| 107 |
+
|
| 108 |
+
## 🏁 快速开始
|
| 109 |
+
|
| 110 |
+
**前置要求:** Python 3.10+ ·(可选)Node 18+ 用于重新构建前端 · GitHub Token(推荐)· LLM API Key(必需)
|
| 111 |
+
|
| 112 |
+
```bash
|
| 113 |
+
# 克隆 & 安装
|
| 114 |
+
git clone https://github.com/tzzp1224/RepoReaper.git && cd RepoReaper
|
| 115 |
+
python -m venv venv && source venv/bin/activate
|
| 116 |
+
pip install -r requirements.txt
|
| 117 |
+
|
| 118 |
+
# 配置 .env(建议从示例复制)
|
| 119 |
+
cp .env.example .env
|
| 120 |
+
# 必需:设置 LLM_PROVIDER 以及对应的 *_API_KEY
|
| 121 |
+
# 推荐:GITHUB_TOKEN 和 SILICON_API_KEY(Embedding)
|
| 122 |
+
|
| 123 |
+
# (可选)构建前端(仓库已包含 frontend-dist)
|
| 124 |
+
cd frontend-vue
|
| 125 |
+
npm install
|
| 126 |
+
npm run build
|
| 127 |
+
cd ..
|
| 128 |
+
|
| 129 |
+
# 启动
|
| 130 |
+
python -m app.main
|
| 131 |
+
```
|
| 132 |
+
|
| 133 |
+
访问 `http://localhost:8000`,输入任意 GitHub 仓库地址开始审计。
|
| 134 |
+
|
| 135 |
+
**Docker(单容器,本地 Qdrant):**
|
| 136 |
+
```bash
|
| 137 |
+
cp .env.example .env
|
| 138 |
+
docker build -t reporeaper .
|
| 139 |
+
docker run -d -p 8000:8000 --env-file .env reporeaper
|
| 140 |
+
```
|
| 141 |
+
|
| 142 |
+
**Docker Compose(推荐,包含 Qdrant Server):**
|
| 143 |
+
```bash
|
| 144 |
+
cp .env.example .env
|
| 145 |
+
# 在 .env 中设置 QDRANT_MODE=server 与 QDRANT_URL=http://qdrant:6333
|
| 146 |
+
docker compose up -d --build
|
| 147 |
+
```
|
| 148 |
+
|
| 149 |
+
---
|
| 150 |
+
|
| 151 |
+
## 📊 评估与追踪现状
|
| 152 |
+
|
| 153 |
+
| 组件 | 状态 | 说明 |
|
| 154 |
+
|:----|:----:|:----|
|
| 155 |
+
| **自研评估引擎** | ✅ 可用 | 四层指标(QueryRewrite / Retrieval / Generation / Agentic),LLM-as-Judge 判分 |
|
| 156 |
+
| **在线自动评估** | ✅ 可用 | 每次 `/chat` 结束后异步触发,结果写入 `evaluation/sft_data/` |
|
| 157 |
+
| **数据路由 (SFT)** | ✅ 可用 | 按评分自动分流 Gold/Silver/Bronze/Rejected → JSONL 文件 |
|
| 158 |
+
| **评估 API** | ✅ 可用 | `/evaluate`、`/evaluation/stats`、`/dashboard/*`、`/auto-eval/*` 共 7 个端点 |
|
| 159 |
+
| **离线检索评估** | ✅ 可用 | `test_retrieval.py` — Hit Rate、Recall@K、Precision@K、MRR |
|
| 160 |
+
| **Langfuse 追踪** | ⚠️ 部分完成 | 框架 + 14 处埋点已就位(agent/chat service);不可用时自动降级为本地日志 `logs/traces/` |
|
| 161 |
+
| **Ragas 集成** | ❌ 占位 | 默认 `use_ragas=False`;`_ragas_eval()` 调用方式与最新 Ragas SDK 不兼容 |
|
| 162 |
+
| **Langfuse ↔ 评估** | ❌ 未打通 | 评估结果仅写 JSONL,未上报 Langfuse Scores API |
|
| 163 |
+
|
| 164 |
+
> **综合完成度约 65%**:自研评估链路已闭环可用;Ragas 与 Langfuse 集成均为半成品。
|
| 165 |
+
|
| 166 |
+
---
|
| 167 |
+
|
| 168 |
+
## ⚠️ 已知问题
|
| 169 |
+
|
| 170 |
+
1. **Python 3.14 + Langfuse 导入报错**
|
| 171 |
+
`pydantic.v1.errors.ConfigError: unable to infer type for attribute "description"` — Langfuse 3.x 内部依赖 `pydantic.v1` 兼容层,该兼容层在 Python 3.14 下不兼容。
|
| 172 |
+
**临时方案:** 在 `.env` 中设置 `LANGFUSE_ENABLED=false`,或使用 Python 3.10–3.12。
|
| 173 |
+
|
| 174 |
+
2. **`docker-compose.yml` 未包含 Langfuse 服务**
|
| 175 |
+
即使导入成功,仍需运行中的 Langfuse 实例。请自行添加或使用 [app.langfuse.com](https://app.langfuse.com)。
|
| 176 |
+
|
| 177 |
+
3. **Trace 链路未关联**
|
| 178 |
+
`tracing_service` 记录了 span/event,但调用 Langfuse API 时未传 `trace_id`,Langfuse UI 中只能看到孤立事件而非完整链路树。
|
| 179 |
+
|
| 180 |
+
4. **Ragas `_ragas_eval()` API 过时**
|
| 181 |
+
当前向 `ragas.evaluate()` 传递 dict,最新 Ragas 要求 `Dataset` 对象。已导出 `ragas_eval_dataset.json` 但无脚本消费它。
|
| 182 |
+
|
| 183 |
+
5. **黄金数据集缺少标准答案**
|
| 184 |
+
26 条测试用例的 `expected_answer` 均为空,无法做生成质量的 ground truth 对比。
|
| 185 |
+
|
| 186 |
+
6. **启发式降级较粗糙**
|
| 187 |
+
无 LLM client 时,`faithfulness` 用关键词重叠 + 0.2 基础分;`completeness` 纯粹按字数判断。
|
| 188 |
+
|
| 189 |
+
---
|
| 190 |
+
|
| 191 |
+
## 🗺 路线图
|
| 192 |
+
|
| 193 |
+
- [ ] **修复 Langfuse 兼容性** — 固定 `langfuse`/`pydantic` 版本或按 Python 版本门控导入
|
| 194 |
+
- [ ] **`docker-compose.yml` 加入 Langfuse** — 一键启动本地可观测平台
|
| 195 |
+
- [ ] **串联 trace_id** — 让 Langfuse UI 展示完整链路树
|
| 196 |
+
- [ ] **正式接入 Ragas** — 更新 `_ragas_eval()` 使用 `ragas.evaluate(Dataset(...))`,新增独立评估脚本
|
| 197 |
+
- [ ] **丰富黄金数据集** — 补充 `expected_answer`,扩展至 50+ 条用例
|
| 198 |
+
- [ ] **评估仪表盘前端** — Vue 组件可视化质量分布与 Bad Case
|
| 199 |
+
- [ ] **CI 回归基线** — 在 GitHub Actions 中运行 `test_retrieval.py`,指标回退时失败
|
| 200 |
+
- [ ] **对接 Langfuse Datasets** — 将评估结果推送到 Langfuse Scores/Datasets API,统一可观测
|
| 201 |
+
|
| 202 |
+
---
|
| 203 |
+
|
| 204 |
+
## 📈 Star History
|
| 205 |
+
|
| 206 |
+
<a href="https://star-history.com/#tzzp1224/RepoReaper&Date">
|
| 207 |
+
<picture>
|
| 208 |
+
<source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=tzzp1224/RepoReaper&type=Date&theme=dark" />
|
| 209 |
+
<source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=tzzp1224/RepoReaper&type=Date" />
|
| 210 |
+
<img alt="Star History Chart" src="https://api.star-history.com/svg?repos=tzzp1224/RepoReaper&type=Date" />
|
| 211 |
+
</picture>
|
| 212 |
+
</a>
|
app/core/config.py
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 文件路径: app/core/config.py
|
| 2 |
+
"""
|
| 3 |
+
应用配置模块 - 统一配置中心
|
| 4 |
+
|
| 5 |
+
支持多 LLM 供应商配置:
|
| 6 |
+
- OpenAI (GPT-4, GPT-4o 等)
|
| 7 |
+
- DeepSeek (deepseek-chat 等)
|
| 8 |
+
- Anthropic (Claude 系列)
|
| 9 |
+
- Google Gemini (gemini-3-flash-preview 等)
|
| 10 |
+
"""
|
| 11 |
+
import os
|
| 12 |
+
from dataclasses import dataclass, field
|
| 13 |
+
from typing import Optional, Tuple
|
| 14 |
+
from dotenv import load_dotenv
|
| 15 |
+
|
| 16 |
+
# 加载 .env 文件
|
| 17 |
+
load_dotenv()
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# ============================================================
|
| 21 |
+
# Agent 分析配置
|
| 22 |
+
# ============================================================
|
| 23 |
+
|
| 24 |
+
@dataclass
class AgentAnalysisConfig:
    """Tunable defaults for the agent analysis engine."""

    # --- Repo map ---
    # Number of files in the initial repo map (raised for precision).
    initial_map_limit: int = 25
    # Maximum number of symbols listed per file (raised for precision).
    max_symbols_per_file: int = 40

    # --- Analysis rounds ---
    # Maximum number of analysis rounds (raised for precision; reports are reusable).
    max_rounds: int = 4
    # Number of files selected in each round (raised for precision).
    files_per_round: int = 5
    # Maximum context length (raised for precision).
    max_context_length: int = 20_000

    # --- Prioritisation ---
    priority_exts: Tuple[str, ...] = (
        '.py',
        '.java',
        '.go',
        '.js',
        '.ts',
        '.tsx',
        '.cpp',
        '.cs',
        '.rs',
    )
    priority_keywords: Tuple[str, ...] = (
        'main',
        'app',
        'core',
        'api',
        'service',
        'utils',
        'controller',
        'model',
        'config',
    )
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# ============================================================
|
| 46 |
+
# 向量服务配置
|
| 47 |
+
# ============================================================
|
| 48 |
+
|
| 49 |
+
@dataclass
class VectorServiceConfig:
    """Default settings for the vector service (storage paths, embedding, BM25, RRF)."""

    # --- Data directories ---
    data_dir: str = "data"
    context_dir: str = "data/contexts"
    cache_version: str = "2.0"

    # --- Embedding ---
    embedding_api_url: str = "https://api.siliconflow.cn/v1"
    embedding_model: str = "BAAI/bge-m3"
    embedding_batch_size: int = 50
    embedding_max_length: int = 8_000
    embedding_concurrency: int = 5
    embedding_dimensions: int = 1_024

    # --- BM25 ---
    # Pattern of separator characters used for tokenisation; everything except
    # ASCII letters/digits, "_", ".", "@" and the CJK range U+4E00..U+9FA5 matches.
    tokenize_regex: str = r'[^a-zA-Z0-9_\.@\u4e00-\u9fa5]+'

    # --- Hybrid-search RRF parameters ---
    rrf_k: int = 60
    rrf_weight_vector: float = 1.0
    rrf_weight_bm25: float = 0.3
    search_oversample: int = 2
    default_top_k: int = 3

    # --- Session LRU cache ---
    # Maximum number of sessions kept in memory.
    session_max_count: int = 100
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
# ============================================================
|
| 80 |
+
# 对话记忆配置
|
| 81 |
+
# ============================================================
|
| 82 |
+
|
| 83 |
+
@dataclass
class ConversationConfig:
    """Default settings for conversation memory (sliding window)."""

    # Keep the most recent N conversation turns.
    max_recent_turns: int = 10
    # Maximum number of context tokens.
    max_context_tokens: int = 8_000
    # Start compressing once the conversation exceeds N turns.
    summary_threshold: int = 15
    # Conversation memory is held purely in memory and is cleared by a service
    # restart, so no scheduled cleanup is needed.
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
# ============================================================
|
| 94 |
+
# Qdrant 配置
|
| 95 |
+
# ============================================================
|
| 96 |
+
|
| 97 |
+
@dataclass
class QdrantServiceConfig:
    """
    Qdrant vector database settings.

    Three modes are supported, selected with the QDRANT_MODE environment variable:
    - local:  embedded local storage (development, single worker)
    - server: Qdrant Server in Docker (production, multiple workers)
    - cloud:  hosted Qdrant Cloud service

    Environment variables:
    - QDRANT_MODE:       "local" | "server" | "cloud"
    - QDRANT_URL:        server URL (server/cloud modes)
    - QDRANT_API_KEY:    API key (required in cloud mode)
    - QDRANT_LOCAL_PATH: local storage path (local mode)

    The environment-backed defaults below are evaluated once, when the class
    body is executed.
    """

    # --- Connection ---
    mode: str = os.getenv("QDRANT_MODE", "local")
    url: str = os.getenv("QDRANT_URL", "")
    host: str = os.getenv("QDRANT_HOST", "localhost")
    port: int = int(os.getenv("QDRANT_PORT", "6333"))
    grpc_port: int = int(os.getenv("QDRANT_GRPC_PORT", "6334"))
    prefer_grpc: bool = True
    api_key: str = os.getenv("QDRANT_API_KEY", "")

    # --- Local storage ---
    local_path: str = os.getenv("QDRANT_LOCAL_PATH", "data/qdrant_db")

    # --- Index and request parameters ---
    vector_size: int = 1_024  # BGE-M3 dimensionality
    hnsw_m: int = 16
    hnsw_ef_construct: int = 100
    batch_size: int = 100
    timeout: float = 30.0
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
# ============================================================
|
| 131 |
+
# LLM 供应商配置
|
| 132 |
+
# ============================================================
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
class Settings:
    """
    Application settings, read from environment variables.

    All class attributes are evaluated once, when the class body is executed.
    The properties resolve the API key, base URL and default model for the
    provider named by ``LLM_PROVIDER``; ``validate()`` checks that the
    selection is usable.
    """

    # Providers for which key / base-URL / default-model lookups are defined.
    _SUPPORTED_PROVIDERS = ("openai", "deepseek", "anthropic", "gemini")

    # --- LLM provider selection ---
    # Supported: "openai", "deepseek", "anthropic", "gemini"
    LLM_PROVIDER = os.getenv("LLM_PROVIDER", "deepseek")

    # --- API keys (configure the key matching the selected provider) ---
    GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")

    # OpenAI
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
    OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL")  # optional custom endpoint

    # DeepSeek
    DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
    DEEPSEEK_BASE_URL = os.getenv("DEEPSEEK_BASE_URL", "https://api.deepseek.com")

    # Anthropic (Claude)
    ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")

    # Google Gemini
    GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
    GEMINI_BASE_URL = os.getenv("GEMINI_BASE_URL")  # optional OpenAI-compatible endpoint

    # SiliconFlow (embedding)
    SILICON_API_KEY = os.getenv("SILICON_API_KEY")

    # --- Model ---
    # When unset, the selected provider's default model is used.
    MODEL_NAME = os.getenv("MODEL_NAME")

    # --- Server ---
    HOST = os.getenv("HOST", "127.0.0.1")
    PORT = int(os.getenv("PORT", "8000"))

    # --- LLM default parameters ---
    LLM_TEMPERATURE = float(os.getenv("LLM_TEMPERATURE", "0.1"))
    LLM_MAX_TOKENS = int(os.getenv("LLM_MAX_TOKENS", "4096"))
    LLM_TIMEOUT = int(os.getenv("LLM_TIMEOUT", "600"))

    def _provider_key(self) -> str:
        """Return LLM_PROVIDER normalised (trimmed, lower-cased) for table lookups."""
        return self.LLM_PROVIDER.strip().lower()

    @property
    def current_api_key(self) -> Optional[str]:
        """API key of the selected provider, or None if unset or the provider is unknown."""
        key_mapping = {
            "openai": self.OPENAI_API_KEY,
            "deepseek": self.DEEPSEEK_API_KEY,
            "anthropic": self.ANTHROPIC_API_KEY,
            "gemini": self.GEMINI_API_KEY,
        }
        return key_mapping.get(self._provider_key())

    @property
    def current_base_url(self) -> Optional[str]:
        """Base URL of the selected provider, or None when none is configured."""
        url_mapping = {
            "openai": self.OPENAI_BASE_URL,
            "deepseek": self.DEEPSEEK_BASE_URL,
            "anthropic": None,
            "gemini": self.GEMINI_BASE_URL,
        }
        return url_mapping.get(self._provider_key())

    @property
    def default_model_name(self) -> str:
        """MODEL_NAME if set, otherwise the selected provider's default model name."""
        defaults = {
            "openai": "gpt-4o-mini",
            "deepseek": "deepseek-chat",
            "anthropic": "claude-3-5-sonnet-20241022",
            "gemini": "gemini-3-flash-preview",
        }
        return self.MODEL_NAME or defaults.get(self._provider_key(), "default")

    def validate(self):
        """
        Check at startup that the required configuration is present.

        Raises:
            ValueError: if LLM_PROVIDER is not a supported provider, or if the
                selected provider's API key is missing.

        A missing SILICON_API_KEY or GITHUB_TOKEN only prints a warning.
        """
        provider = self._provider_key()
        print(f"🔧 LLM Provider: {provider.upper()}")

        # 0. Reject unknown providers explicitly. Without this check the
        #    missing-key branch below would ask for a non-existent
        #    "<PROVIDER>_API_KEY" variable, which is misleading.
        if provider not in self._SUPPORTED_PROVIDERS:
            raise ValueError(
                f"❌ 错误: 不支持的 LLM_PROVIDER: {provider!r}。\n"
                f" 可选值: {', '.join(self._SUPPORTED_PROVIDERS)}"
            )

        # 1. The selected provider's API key is mandatory.
        if not self.current_api_key:
            key_name = f"{provider.upper()}_API_KEY"
            raise ValueError(
                f"❌ 错误: 缺少 {key_name}。\n"
                f" 当前选择的 LLM 供应商是: {provider}\n"
                f" 请在 .env 文件中设置 {key_name},或更改 LLM_PROVIDER 为其他供应商。"
            )

        # 2. SiliconFlow key (embedding feature) - warning only.
        if not self.SILICON_API_KEY:
            print("⚠️ 警告: 未找到 SILICON_API_KEY,向量检索功能可能无法工作。")

        # 3. GitHub token (optional but recommended) - warning only.
        if not self.GITHUB_TOKEN:
            print("⚠️ 警告: 未找到 GITHUB_TOKEN,GitHub API 请求将受到每小时 60 次的严格限制。")

        print(f"✅ 配置验证通过 (Model: {self.default_model_name})")
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
# ============================================================
|
| 235 |
+
# 全局配置实例
|
| 236 |
+
# ============================================================
|
| 237 |
+
|
| 238 |
+
# LLM settings (module-level singleton)
settings = Settings()
# Runs at import time: Settings.validate() raises ValueError when the selected
# provider's API key is missing, so importing this module fails in that case.
settings.validate()

# Sub-system configuration singletons, built from the dataclass defaults
agent_config = AgentAnalysisConfig()
vector_config = VectorServiceConfig()
conversation_config = ConversationConfig()
qdrant_config = QdrantServiceConfig()
|
app/main.py
ADDED
|
@@ -0,0 +1,560 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 文件路径: app/main.py
|
| 2 |
+
import sys
|
| 3 |
+
import io
|
| 4 |
+
import os
|
| 5 |
+
import asyncio
|
| 6 |
+
from contextlib import asynccontextmanager
|
| 7 |
+
|
| 8 |
+
# 强制 stdout 使用 utf-8,防止 Windows 控制台乱码
|
| 9 |
+
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
|
| 10 |
+
|
| 11 |
+
from fastapi import FastAPI, Request
|
| 12 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 13 |
+
from sse_starlette.sse import EventSourceResponse
|
| 14 |
+
from fastapi.responses import StreamingResponse, HTMLResponse, JSONResponse
|
| 15 |
+
from fastapi.staticfiles import StaticFiles
|
| 16 |
+
import uvicorn
|
| 17 |
+
|
| 18 |
+
from app.core.config import settings
|
| 19 |
+
from app.services.agent_service import agent_stream
|
| 20 |
+
from app.services.chat_service import process_chat_stream, get_eval_data, clear_eval_data
|
| 21 |
+
from app.services.vector_service import store_manager
|
| 22 |
+
from app.services.auto_evaluation_service import (
|
| 23 |
+
init_auto_evaluation_service,
|
| 24 |
+
get_auto_evaluation_service,
|
| 25 |
+
EvaluationConfig
|
| 26 |
+
)
|
| 27 |
+
from evaluation.evaluation_framework import EvaluationEngine, EvaluationResult, DataRoutingEngine
|
| 28 |
+
from datetime import datetime
|
| 29 |
+
import uuid
|
| 30 |
+
|
| 31 |
+
settings.validate()
|
| 32 |
+
|
| 33 |
+
# === 生命周期管理 ===
|
| 34 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Application lifespan hook.

    Code before ``yield`` runs at startup; code after it runs at shutdown and
    closes, in order, the GitHub client, the vector stores and the shared
    Qdrant client.
    """
    from app.services.vector_service import store_manager as stores

    # --- Startup ---
    print("🚀 Application starting...")
    # Repository data is stored permanently; conversation memory is purely
    # in-memory and is cleared by a restart.

    yield

    # --- Shutdown ---
    print("🛑 Application shutting down...")

    # Close the GitHub client connection.
    from app.utils.github_client import close_github_client as shutdown_github
    await shutdown_github()

    # Close the vector store connections.
    await stores.close_all()

    # Close the shared Qdrant client.
    from app.storage.qdrant_store import close_shared_client as shutdown_qdrant
    await shutdown_qdrant()

    print("✅ Cleanup complete")
|
| 60 |
+
|
| 61 |
+
app = FastAPI(title="GitHub RAG Agent", lifespan=lifespan)

# === Evaluation engine setup ===
from app.utils.llm_client import client
eval_engine = EvaluationEngine(llm_client=client, model_name=settings.default_model_name)
data_router = DataRoutingEngine()

# === Auto-evaluation service setup (Phase 1) ===
auto_eval_config = EvaluationConfig(
    enabled=True,
    use_ragas=False,               # Phase 1: Ragas is off to avoid the extra dependency
    async_evaluation=True,         # asynchronous mode, does not block the response
    min_quality_score=0.4,         # minimum score threshold (0.4 = reject only the worst)
    min_query_length=10,           # minimum query length
    min_answer_length=100,         # minimum answer length
    require_repo_url=True,         # a repository URL is required
    require_code_in_context=True   # the context must contain code
)
auto_eval_service = init_auto_evaluation_service(
    eval_engine=eval_engine,
    data_router=data_router,
    config=auto_eval_config
)
print("✅ Auto Evaluation Service Initialized")

# NOTE(review): wildcard origins combined with allow_credentials=True lets any
# site make credentialed requests - confirm this is intended outside development.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# === Static files and frontend ===
# NOTE(review): this serves the contents of the "app" directory (resolved
# relative to the working directory) under /static - confirm that exposing
# that directory over HTTP is intended.
app.mount("/static", StaticFiles(directory="app"), name="static")

# Static assets (JS/CSS/assets) produced by the Vue 3 build
import os  # NOTE(review): redundant - os is already imported at the top of this file
FRONTEND_DIST = os.path.join(os.path.dirname(os.path.dirname(__file__)), "frontend-dist")
if os.path.exists(FRONTEND_DIST):
    app.mount("/assets", StaticFiles(directory=os.path.join(FRONTEND_DIST, "assets")), name="vue-assets")
|
| 102 |
+
|
| 103 |
+
@app.get("/", response_class=HTMLResponse)
async def read_root():
    """
    Serve the front-end entry page.

    The Vue 3 build (``FRONTEND_DIST/index.html``) is preferred; when it does
    not exist, the original ``frontend/index.html`` is read instead.
    """
    built_index = os.path.join(FRONTEND_DIST, "index.html")
    # Fall back to the original front end when no Vue build is present.
    page_path = built_index if os.path.exists(built_index) else "frontend/index.html"
    with open(page_path, "r", encoding="utf-8") as page:
        return page.read()
|
| 113 |
+
|
| 114 |
+
@app.get("/health")
def health_check():
    """Health probe: always returns a static ``{"status": "ok"}`` payload."""
    payload = {"status": "ok"}
    return payload
|
| 117 |
+
|
| 118 |
+
@app.get("/api/sessions")
async def get_sessions():
    """Return the session-management statistics reported by ``store_manager.get_stats()``."""
    session_stats = store_manager.get_stats()
    return JSONResponse(session_stats)
|
| 122 |
+
|
| 123 |
+
@app.post("/api/sessions/cleanup")
async def trigger_cleanup():
    """Manually trigger cleanup of expired files and return the resulting stats."""
    cleanup_stats = await store_manager.cleanup_expired_files()
    body = {"message": "Cleanup completed", "stats": cleanup_stats}
    return JSONResponse(body)
|
| 128 |
+
|
| 129 |
+
@app.delete("/api/sessions/{session_id}")
async def close_session(session_id: str):
    """Close the session identified by ``session_id`` and confirm with a JSON message."""
    await store_manager.close_session(session_id)
    body = {"message": f"Session {session_id} closed"}
    return JSONResponse(body)
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
# === 仓库级 Session API ===
|
| 137 |
+
|
| 138 |
+
@app.post("/api/repo/check")
async def check_repo_session(request: Request):
    """
    Check whether a repository already has an index and a report in the
    requested language.

    Request body: { "url": "https://github.com/owner/repo", "language": "zh" }

    Response:
        {
            "exists": true/false,
            "session_id": "repo_xxx",
            "report": "..." (when a report in that language exists),
            "has_index": true/false,
            "available_languages": ["en", "zh"]
        }

    A body that is not a JSON object, or a missing / empty / non-string
    ``url``, yields HTTP 400.
    """
    from app.utils.session import generate_repo_session_id

    # Malformed JSON raises a ValueError subclass; report it as a client error
    # rather than letting it surface as a 500.
    try:
        data = await request.json()
    except ValueError:
        return JSONResponse({"error": "Invalid JSON body"}, status_code=400)
    if not isinstance(data, dict):
        return JSONResponse({"error": "Invalid JSON body"}, status_code=400)

    # "url" may be present but null or non-string; only strip real strings.
    raw_url = data.get("url")
    repo_url = raw_url.strip() if isinstance(raw_url, str) else ""
    # An explicit null language falls back to the default as well.
    language = data.get("language") or "en"

    if not repo_url:
        return JSONResponse({"error": "Missing URL"}, status_code=400)

    # Session ID derived from the repository URL.
    session_id = generate_repo_session_id(repo_url)

    store = store_manager.get_store(session_id)

    # Try to load a previously saved context.
    context = store.load_context()

    if context and context.get("repo_url"):
        # The repository has been analysed before: fetch the report in the
        # requested language (may be None) and the languages already available.
        report = store.get_report(language)
        available_languages = store.get_available_languages()
        global_context = context.get("global_context", {})
        has_index = bool(global_context.get("file_tree"))

        return JSONResponse({
            "exists": True,
            "session_id": session_id,
            "repo_url": context.get("repo_url"),
            "report": report,
            "has_index": has_index,
            "available_languages": available_languages,
            "requested_language": language,
        })

    return JSONResponse({
        "exists": False,
        "session_id": session_id,
        "has_index": False,
        "available_languages": [],
    })
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
@app.get("/analyze")
async def analyze(url: str, session_id: str, language: str = "en", regenerate_only: bool = False):
    """
    Repository analysis endpoint (server-sent events).

    Args:
        url: repository URL
        session_id: session ID
        language: report language ("en" or "zh")
        regenerate_only: when True, skip fetching/indexing and generate a
            report in the new language from the existing index

    Returns:
        An ``EventSourceResponse`` streaming ``agent_stream(...)``, or an
        HTTP 400 JSON error when ``session_id`` is empty.
    """
    if not session_id:
        # Signal the client error with a 400, matching /api/repo/check,
        # instead of a 200 response that merely carries an error field.
        return JSONResponse({"error": "Missing session_id"}, status_code=400)
    return EventSourceResponse(agent_stream(url, session_id, language, regenerate_only))
|
| 210 |
+
|
| 211 |
+
# Strong references to in-flight background evaluation tasks. The event loop
# keeps only weak references to tasks, so a task whose result is discarded can
# be garbage-collected before it finishes.
_pending_eval_tasks = set()


@app.post("/chat")
async def chat(request: Request):
    """
    Chat endpoint with automatic evaluation.

    1. The chat answer is streamed back immediately (not blocked by evaluation).
    2. After the stream ends, an evaluation is scheduled as a background task.
    3. Evaluation results are stored under evaluation/sft_data/.

    Request body fields: ``query``, ``session_id``, optional ``repo_url``.
    Returns a plain-text ``StreamingResponse``, or a small JSON answer when
    ``query`` or ``session_id`` is missing.
    """
    data = await request.json()
    user_query = data.get("query")
    session_id = data.get("session_id")
    repo_url = data.get("repo_url", "")

    if not user_query:
        return {"answer": "Please enter your question"}
    if not session_id:
        return {"answer": "Session lost"}

    async def chat_stream_with_eval():
        """Wrap process_chat_stream and trigger the evaluation once the stream ends."""
        # Drop evaluation data left over from a previous turn.
        clear_eval_data(session_id)

        async for chunk in process_chat_stream(user_query, session_id):
            yield chunk

        # The stream is finished; chat_service has stored the evaluation data by now.
        try:
            eval_service = get_auto_evaluation_service()
            eval_data = get_eval_data(session_id)

            if eval_service and eval_data and eval_data.answer:
                print(f"\n📊 [Auto-Eval] Starting evaluation for session {session_id}")
                print(f" - Query: {user_query[:50]}...")
                print(f" - Context length: {len(eval_data.retrieved_context)} chars")
                print(f" - Answer length: {len(eval_data.answer)} chars")

                # Any CJK character in the query selects "zh", otherwise "en".
                has_cjk = any('\u4e00' <= c <= '\u9fff' for c in user_query)

                # Run the evaluation in the background so the end of the
                # stream is not delayed; keep a reference until it completes.
                task = asyncio.create_task(
                    eval_service.auto_evaluate_async(
                        query=user_query,
                        retrieved_context=eval_data.retrieved_context,
                        generated_answer=eval_data.answer,
                        session_id=session_id,
                        repo_url=repo_url,
                        language="zh" if has_cjk else "en"
                    )
                )
                _pending_eval_tasks.add(task)
                task.add_done_callback(_pending_eval_tasks.discard)
            else:
                if not eval_service:
                    print("⚠️ Auto evaluation service not initialized")
                elif not eval_data:
                    print(f"⚠️ No eval data found for session {session_id}")
                elif not eval_data.answer:
                    print(f"⚠️ Empty answer for session {session_id}")
        except Exception as e:
            # Evaluation is best-effort: report the failure, never break the chat.
            print(f"⚠️ Failed to trigger auto-eval: {e}")
            import traceback
            traceback.print_exc()

    return StreamingResponse(
        chat_stream_with_eval(),
        media_type="text/plain"
    )
|
| 287 |
+
|
| 288 |
+
# ===== Phase 2: 新增评估端点 =====
|
| 289 |
+
|
| 290 |
+
@app.post("/evaluate")
async def evaluate(request: Request):
    """
    Evaluation endpoint: score a generated answer on several dimensions.

    POST /evaluate
    {
        "query": "user question",
        "retrieved_context": "retrieved file content",
        "generated_answer": "generated answer",
        "session_id": "session ID",
        "repo_url": "repository URL (optional)"
    }

    Returns a dict with ``status`` "success" plus the scores and quality tier,
    or ``status`` "failed" plus an ``error`` message.
    """
    try:
        payload = await request.json()

        # Required fields first, then the optional ones with their defaults.
        query = payload.get("query")
        generated_answer = payload.get("generated_answer")
        retrieved_context = payload.get("retrieved_context", "")
        session_id = payload.get("session_id", "unknown")
        repo_url = payload.get("repo_url", "")

        if not (query and generated_answer):
            return {
                "error": "Missing required fields: query, generated_answer",
                "status": "failed"
            }

        # Generation-layer metrics from the evaluation engine.
        metrics = await eval_engine.evaluate_generation(
            query=query,
            retrieved_context=retrieved_context,
            generated_answer=generated_answer
        )

        # Full evaluation record for this sample.
        result = EvaluationResult(
            session_id=session_id,
            query=query,
            repo_url=repo_url,
            timestamp=datetime.now(),
            language="en",
            generation_metrics=metrics
        )

        # Overall score is computed on the result object itself.
        result.compute_overall_score()

        # Data routing: classify the sample according to its score.
        tier = data_router.route_sample(result)

        scores = {
            "faithfulness": metrics.faithfulness,
            "answer_relevance": metrics.answer_relevance,
            "answer_completeness": metrics.answer_completeness,
            "overall_score": result.overall_score
        }
        return {
            "status": "success",
            "evaluation": scores,
            "quality_tier": tier,
            "session_id": session_id
        }

    except Exception as e:
        import traceback
        traceback.print_exc()
        return {
            "error": str(e),
            "status": "failed"
        }
|
| 362 |
+
|
| 363 |
+
|
| 364 |
+
# ===== 自动评估相关端点 =====
|
| 365 |
+
|
| 366 |
+
@app.get("/auto-eval/review-queue")
async def get_review_queue():
    """
    List the samples that need manual review.

    These are samples whose evaluation looked anomalous (the custom score and
    the Ragas score differ too much); a human decides which evaluator is right.

    GET /auto-eval/review-queue
    """
    try:
        service = get_auto_evaluation_service()
        if not service:
            return {"error": "Auto evaluation service not initialized", "status": "failed"}

        queue = service.get_review_queue()
        queue_size = len(queue)

        # One summary entry per queued item, in queue order.
        samples = []
        for position, entry in enumerate(queue):
            samples.append({
                "index": position,
                "query": entry["eval_result"].query,
                "custom_score": entry["custom_score"],
                "ragas_score": entry["ragas_score"],
                "diff": entry["diff"],
                "quality_tier": entry["eval_result"].data_quality_tier.value,
                "timestamp": entry["timestamp"]
            })

        return {
            "status": "success",
            "queue_size": queue_size,
            "samples": samples
        }
    except Exception as e:
        return {"error": str(e), "status": "failed"}
|
| 401 |
+
|
| 402 |
+
|
| 403 |
+
@app.post("/auto-eval/approve/{index}")
async def approve_sample(index: int):
    """
    Manually approve one sample of the review queue (accept its evaluation).

    POST /auto-eval/approve/0

    Args:
        index: Position of the sample in the review queue.

    Returns:
        {"status": "success", "message": ...} when the service call succeeds,
        otherwise {"error": str, "status": "failed"}.
    """
    try:
        service = get_auto_evaluation_service()
        if not service:
            return {"error": "Auto evaluation service not initialized", "status": "failed"}

        service.approve_sample(index)

        outcome = {"status": "success"}
        outcome["message"] = f"Sample {index} approved and stored"
        return outcome
    except Exception as exc:
        return {"error": str(exc), "status": "failed"}
|
| 423 |
+
|
| 424 |
+
|
| 425 |
+
@app.post("/auto-eval/reject/{index}")
async def reject_sample(index: int):
    """
    Manually reject one sample of the review queue (discard its evaluation).

    POST /auto-eval/reject/0

    Args:
        index: Position of the sample in the review queue.

    Returns:
        {"status": "success", "message": ...} when the service call succeeds,
        otherwise {"error": str, "status": "failed"}.
    """
    try:
        service = get_auto_evaluation_service()
        if not service:
            return {"error": "Auto evaluation service not initialized", "status": "failed"}

        service.reject_sample(index)

        outcome = {"status": "success"}
        outcome["message"] = f"Sample {index} rejected and removed from queue"
        return outcome
    except Exception as exc:
        return {"error": str(exc), "status": "failed"}
|
| 445 |
+
|
| 446 |
+
|
| 447 |
+
@app.get("/auto-eval/stats")
async def auto_eval_stats():
    """
    Report the auto-evaluation configuration and the review-queue size.

    GET /auto-eval/stats

    Returns:
        On success: the service configuration flags/weights, the number of
        samples in the review queue and the current time in ISO format.
        On failure: {"error": str, "status": "failed"}.
    """
    try:
        service = get_auto_evaluation_service()
        if not service:
            return {"error": "Auto evaluation service not initialized", "status": "failed"}

        pending = service.get_review_queue()

        # Snapshot of the configuration values exposed to the dashboard.
        cfg = service.config
        config_view = {
            "enabled": cfg.enabled,
            "use_ragas": cfg.use_ragas,
            "async_mode": cfg.async_evaluation,
            "custom_weight": cfg.custom_weight,
            "ragas_weight": cfg.ragas_weight,
            "diff_threshold": cfg.diff_threshold,
        }

        return {
            "status": "success",
            "auto_evaluation": config_view,
            "review_queue_size": len(pending),
            "last_update": datetime.now().isoformat(),
        }
    except Exception as exc:
        return {"error": str(exc), "status": "failed"}
|
| 476 |
+
|
| 477 |
+
|
| 478 |
+
@app.get("/evaluation/stats")
async def evaluation_stats():
    """
    Return aggregate evaluation statistics from the evaluation engine.

    GET /evaluation/stats

    Returns:
        On success: {"status": "success", "statistics": {...}} with the total
        number of evaluations, the average score, the quality distribution
        and the top issues (each falling back to an empty/zero default when
        the engine does not report it).
        On failure: {"error": str, "status": "failed"}.
    """
    try:
        raw_stats = eval_engine.get_statistics()

        # (field, default) pairs, kept in the order they appear in the reply.
        summary = {}
        for field, fallback in (
            ("total_evaluations", 0),
            ("average_score", 0),
            ("quality_distribution", {}),
            ("top_issues", []),
        ):
            summary[field] = raw_stats.get(field, fallback)

        return {"status": "success", "statistics": summary}
    except Exception as exc:
        return {"error": str(exc), "status": "failed"}
|
| 501 |
+
|
| 502 |
+
|
| 503 |
+
@app.get("/dashboard/quality-distribution")
async def quality_distribution():
    """
    Return the data-quality distribution (used by the dashboard).

    GET /dashboard/quality-distribution

    Returns:
        On success: per-tier counts (gold, silver, bronze, rejected,
        corrected; missing tiers count as 0) plus the current time in ISO
        format.
        On failure: {"error": str, "status": "failed"}.
    """
    try:
        counts = data_router.get_distribution()
        tiers = ("gold", "silver", "bronze", "rejected", "corrected")
        per_tier = {tier: counts.get(tier, 0) for tier in tiers}
        return {
            "status": "success",
            "distribution": per_tier,
            "timestamp": datetime.now().isoformat(),
        }
    except Exception as exc:
        return {"error": str(exc), "status": "failed"}
|
| 528 |
+
|
| 529 |
+
|
| 530 |
+
@app.get("/dashboard/bad-cases")
async def bad_cases():
    """
    Return low-quality samples (intended for manual review).

    GET /dashboard/bad-cases

    Returns:
        On success: up to 10 samples requested from the data router, each
        reduced to its query, issue and score, plus the number of samples.
        On failure: {"error": str, "status": "failed"}.
    """
    try:
        flagged = data_router.get_bad_samples(limit=10)

        cases = []
        for sample in flagged:
            cases.append({
                "query": sample.get("query", ""),
                "issue": sample.get("issue", ""),
                "score": sample.get("score", 0),
            })

        return {
            "status": "success",
            "bad_cases": cases,
            "total_bad_cases": len(flagged),
        }
    except Exception as exc:
        return {"error": str(exc), "status": "failed"}
|
| 556 |
+
|
| 557 |
+
|
| 558 |
+
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn using the configured
    # host/port. Keeping reload off is the recommended setting for production.
    uvicorn.run("app.main:app", host=settings.HOST, port=settings.PORT, reload=False)
|
app/services/agent_service.py
ADDED
|
@@ -0,0 +1,779 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 文件路径: app/services/agent_service.py
|
| 2 |
+
import json
|
| 3 |
+
import asyncio
|
| 4 |
+
import traceback
|
| 5 |
+
import re
|
| 6 |
+
import ast
|
| 7 |
+
import httpx
|
| 8 |
+
import time
|
| 9 |
+
from typing import Set, Tuple, List
|
| 10 |
+
from datetime import datetime
|
| 11 |
+
from app.core.config import settings, agent_config
|
| 12 |
+
from app.utils.llm_client import client
|
| 13 |
+
from app.utils.repo_lock import RepoLock
|
| 14 |
+
from app.services.github_service import get_repo_structure, get_file_content
|
| 15 |
+
from app.services.vector_service import store_manager
|
| 16 |
+
from app.services.chunking_service import UniversalChunker, ChunkingConfig
|
| 17 |
+
from app.services.tracing_service import tracing_service
|
| 18 |
+
from evaluation.evaluation_framework import EvaluationEngine, EvaluationResult, DataRoutingEngine
|
| 19 |
+
|
| 20 |
+
# === Helper: robust JSON extraction ===
|
| 21 |
+
def extract_json_from_text(text):
    """
    Best-effort extraction of a JSON value from raw LLM output.

    Strategy:
      1. Strip Markdown code fences (``` or ```json) and try to parse the
         whole remaining text as JSON.
      2. If that fails, parse the outermost "[ ... ]" span found in the text.

    Args:
        text: Raw model output. Anything that is not a ``str`` (for example
            ``None`` when the model returned no content) is treated as
            "nothing to parse" instead of raising.

    Returns:
        The decoded JSON value, or an empty list when nothing could be parsed.
    """
    if not isinstance(text, str):
        return []

    # Remove leading/trailing code fences on any line before parsing.
    cleaned = re.sub(r"^```(json)?|```$", "", text.strip(), flags=re.MULTILINE).strip()
    try:
        return json.loads(cleaned)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; fall through to the list scan.
        pass

    # Greedy match: from the first "[" to the last "]" in the text.
    match = re.search(r"\[.*\]", cleaned, re.DOTALL)
    if match:
        try:
            return json.loads(match.group(0))
        except (ValueError, RecursionError):
            pass
    return []
|
| 32 |
+
|
| 33 |
+
# === Multi-language symbol extraction ===
|
| 34 |
+
def _extract_symbols(content, file_path):
|
| 35 |
+
"""
|
| 36 |
+
根据文件类型,智能提取 Class 和 Function 签名生成地图。
|
| 37 |
+
"""
|
| 38 |
+
ext = file_path.split('.')[-1].lower() if '.' in file_path else ""
|
| 39 |
+
|
| 40 |
+
# 1. Python 使用 AST (最准)
|
| 41 |
+
if ext == 'py':
|
| 42 |
+
return _extract_symbols_python(content)
|
| 43 |
+
|
| 44 |
+
# 2. 其他语言使用正则 (Java, TS, JS, Go, C++)
|
| 45 |
+
elif ext in ['java', 'ts', 'tsx', 'js', 'jsx', 'go', 'cpp', 'cs', 'rs']:
|
| 46 |
+
return _extract_symbols_regex(content, ext)
|
| 47 |
+
|
| 48 |
+
return []
|
| 49 |
+
|
| 50 |
+
def _extract_symbols_python(content):
|
| 51 |
+
try:
|
| 52 |
+
tree = ast.parse(content)
|
| 53 |
+
symbols = []
|
| 54 |
+
for node in tree.body:
|
| 55 |
+
if isinstance(node, ast.ClassDef):
|
| 56 |
+
symbols.append(f" [C] {node.name}")
|
| 57 |
+
for sub in node.body:
|
| 58 |
+
if isinstance(sub, (ast.FunctionDef, ast.AsyncFunctionDef)):
|
| 59 |
+
if not sub.name.startswith("_") or sub.name == "__init__":
|
| 60 |
+
symbols.append(f" - {sub.name}")
|
| 61 |
+
elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
|
| 62 |
+
symbols.append(f" [F] {node.name}")
|
| 63 |
+
return symbols
|
| 64 |
+
except:
|
| 65 |
+
return []
|
| 66 |
+
|
| 67 |
+
def _extract_symbols_regex(content, ext):
|
| 68 |
+
"""
|
| 69 |
+
针对类 C 语言的通用正则提取。
|
| 70 |
+
"""
|
| 71 |
+
symbols = []
|
| 72 |
+
lines = content.split('\n')
|
| 73 |
+
|
| 74 |
+
# 定义各语言的正则模式
|
| 75 |
+
patterns = {
|
| 76 |
+
'java': {
|
| 77 |
+
'class': re.compile(r'(?:public|protected|private)?\s*(?:static|abstract)?\s*(?:class|interface|enum)\s+([a-zA-Z0-9_]+)'),
|
| 78 |
+
'func': re.compile(r'(?:public|protected|private)\s+(?:static\s+)?[\w<>[\]]+\s+([a-zA-Z0-9_]+)\s*\(')
|
| 79 |
+
},
|
| 80 |
+
'ts': {
|
| 81 |
+
'class': re.compile(r'class\s+([a-zA-Z0-9_]+)'),
|
| 82 |
+
'func': re.compile(r'(?:function\s+([a-zA-Z0-9_]+)|const\s+([a-zA-Z0-9_]+)\s*=\s*(?:async\s*)?\(|([a-zA-Z0-9_]+)\s*\([^)]*\)\s*[:\{])')
|
| 83 |
+
},
|
| 84 |
+
'go': {
|
| 85 |
+
'class': re.compile(r'type\s+([a-zA-Z0-9_]+)\s+(?:struct|interface)'),
|
| 86 |
+
'func': re.compile(r'func\s+(?:(?:\(.*\)\s+)?([a-zA-Z0-9_]+)|([a-zA-Z0-9_]+)\()')
|
| 87 |
+
}
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
lang_key = 'java' if ext in ['java', 'cs', 'cpp', 'rs'] else 'go' if ext == 'go' else 'ts'
|
| 91 |
+
rules = patterns.get(lang_key, patterns['java'])
|
| 92 |
+
|
| 93 |
+
count = 0
|
| 94 |
+
for line in lines:
|
| 95 |
+
line = line.strip()
|
| 96 |
+
# === 正则解析优化 (过滤更多干扰项) ===
|
| 97 |
+
if not line or line.startswith(("//", "/*", "*", "#", "print", "console.")): continue
|
| 98 |
+
if count > agent_config.max_symbols_per_file: break
|
| 99 |
+
|
| 100 |
+
# 匹配类
|
| 101 |
+
c_match = rules['class'].search(line)
|
| 102 |
+
if c_match:
|
| 103 |
+
name = next((g for g in c_match.groups() if g), "Unknown")
|
| 104 |
+
symbols.append(f" [C] {name}")
|
| 105 |
+
count += 1
|
| 106 |
+
continue
|
| 107 |
+
|
| 108 |
+
# 匹配方法
|
| 109 |
+
if line.endswith('{') or "=>" in line:
|
| 110 |
+
f_match = rules['func'].search(line)
|
| 111 |
+
if f_match:
|
| 112 |
+
name = next((g for g in f_match.groups() if g), None)
|
| 113 |
+
# 增强过滤
|
| 114 |
+
if name and len(name) > 2 and name not in ['if', 'for', 'switch', 'while', 'catch', 'return']:
|
| 115 |
+
symbols.append(f" - {name}")
|
| 116 |
+
count += 1
|
| 117 |
+
|
| 118 |
+
return symbols
|
| 119 |
+
|
| 120 |
+
async def generate_repo_map(repo_url, file_list, limit=agent_config.initial_map_limit) -> Tuple[str, Set[str]]:
    """
    Build the enhanced repository map (multi-language).

    High-priority files are those whose name ends with one of
    agent_config.priority_exts and that are either shallow (at most two "/"
    in the path) or contain one of agent_config.priority_keywords. Up to
    ``limit`` of them (in sorted order) are downloaded concurrently and
    listed together with their extracted class/function symbols. All other
    files are listed by path only, capped at 300 entries.

    Args:
        repo_url: Repository URL passed through to get_file_content.
        file_list: All file paths of the repository.
        limit: Maximum number of key files to download and parse.

    Returns:
        A tuple (map_text, mapped_files): the map as one string and the set
        of file paths whose symbols are in the map (used by callers to avoid
        mapping the same file twice).
    """
    # Cap on path-only entries listed under "Other Files".
    max_other_files = 300

    # === High-priority files (driven by configuration) ===
    priority_files = [
        f for f in file_list
        if f.endswith(agent_config.priority_exts) and
        (f.count('/') <= 2 or any(k in f.lower() for k in agent_config.priority_keywords))
    ]

    # Deduplicate, order deterministically and truncate.
    targets = sorted(set(priority_files))[:limit]
    mapped_files_set = set(targets)  # files already covered by the map
    # Set membership keeps this linear in len(file_list).
    remaining = [f for f in file_list if f not in mapped_files_set]

    async def process_file(path):
        """Return the map entry for one key file (path plus symbols, if any)."""
        content = await get_file_content(repo_url, path)
        if not content:
            return f"{path} (Read Failed)"

        # Parsing is CPU-bound; run it off the event loop.
        symbols = await asyncio.to_thread(_extract_symbols, content, path)
        if symbols:
            return f"{path}\n" + "\n".join(symbols)
        return path

    repo_map_lines = [f"--- Key Files Structure (Top {len(targets)}) ---"]
    repo_map_lines.extend(await asyncio.gather(*(process_file(f) for f in targets)))

    if remaining:
        repo_map_lines.append("\n--- Other Files ---")
        repo_map_lines.extend(remaining[:max_other_files])
        if len(remaining) > max_other_files:
            repo_map_lines.append(f"... ({len(remaining)-300} more files)")

    return "\n".join(repo_map_lines), mapped_files_set
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
async def agent_stream(repo_url: str, session_id: str, language: str = "en", regenerate_only: bool = False):
    """
    Main analysis flow, exposed as an async generator of JSON event strings.

    Starts a trace, optionally emits a "waiting" event when the lock for this
    session is already held, then runs _agent_stream_inner while holding the
    lock and re-yields every event it produces.

    Args:
        repo_url: GitHub repository URL.
        session_id: Session ID (also used as the lock key).
        language: Report language (zh/en).
        regenerate_only: If True, the inner flow skips indexing and generates
            a report in the new language from existing data.

    Yields:
        JSON-encoded dicts with at least "step" and "message" keys.
    """
    short_id = session_id[-6:] if session_id else "unknown"

    # === Tracing setup ===
    trace_id = tracing_service.start_trace(
        trace_name="agent_analysis",
        session_id=session_id,
        metadata={"repo_url": repo_url, "language": language, "regenerate_only": regenerate_only}
    )
    start_time = time.time()

    # === Tell the client when the lock is currently held by someone else ===
    if not regenerate_only:
        if await RepoLock.is_locked(session_id):
            yield json.dumps({
                "step": "waiting",
                "message": "⏳ Another user is analyzing this repository. Please wait..."
            })

    # === Run the analysis while holding the repository lock ===
    try:
        async with RepoLock.acquire(session_id):
            async for event in _agent_stream_inner(
                repo_url, session_id, language, regenerate_only,
                short_id, trace_id, start_time
            ):
                yield event
    except (TimeoutError, asyncio.TimeoutError) as e:
        # asyncio.TimeoutError only became an alias of the built-in
        # TimeoutError in Python 3.11; catch both so the behaviour is the
        # same on older interpreters.
        yield json.dumps({
            "step": "error",
            "message": f"❌ {str(e)}. The repository is being analyzed by another user."
        })
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
async def _agent_stream_inner(
|
| 212 |
+
repo_url: str, session_id: str, language: str, regenerate_only: bool,
|
| 213 |
+
short_id: str, trace_id: str, start_time: float
|
| 214 |
+
):
|
| 215 |
+
"""
|
| 216 |
+
实际的分析流程 (在锁保护下执行)
|
| 217 |
+
"""
|
| 218 |
+
try:
|
| 219 |
+
vector_db = store_manager.get_store(session_id)
|
| 220 |
+
|
| 221 |
+
# 调试日志:确认 session 隔离
|
| 222 |
+
print(f"🔍 [DEBUG] session_id: {session_id}, collection: {vector_db.collection_name}, context_file: {vector_db._context_file}")
|
| 223 |
+
|
| 224 |
+
# === regenerate_only 模式:跳过索引,直接生成报告 ===
|
| 225 |
+
if regenerate_only:
|
| 226 |
+
yield json.dumps({"step": "init", "message": f"🔄 [Session: {short_id}] Regenerating report in {language}..."})
|
| 227 |
+
await asyncio.sleep(0.3)
|
| 228 |
+
|
| 229 |
+
# 从已有索引加载上下文
|
| 230 |
+
context = vector_db.load_context()
|
| 231 |
+
if not context:
|
| 232 |
+
yield json.dumps({"step": "error", "message": "❌ No existing index found. Please analyze the repository first."})
|
| 233 |
+
return
|
| 234 |
+
|
| 235 |
+
# 正确读取 global_context 内的字段
|
| 236 |
+
global_ctx = context.get("global_context", {})
|
| 237 |
+
file_tree_str = global_ctx.get("file_tree", "")
|
| 238 |
+
context_summary = global_ctx.get("summary", "")
|
| 239 |
+
visited_files = set() # regenerate 模式不需要这个,但报告生成需要引用
|
| 240 |
+
|
| 241 |
+
# 验证上下文与请求的仓库匹配
|
| 242 |
+
stored_repo_url = context.get("repo_url", "")
|
| 243 |
+
if stored_repo_url and repo_url not in stored_repo_url and stored_repo_url not in repo_url:
|
| 244 |
+
print(f"⚠️ [WARNING] repo_url mismatch! Request: {repo_url}, Stored: {stored_repo_url}")
|
| 245 |
+
|
| 246 |
+
yield json.dumps({"step": "generating", "message": f"📝 Generating report in {'Chinese' if language == 'zh' else 'English'}..."})
|
| 247 |
+
else:
|
| 248 |
+
# === 正常分析模式 ===
|
| 249 |
+
yield json.dumps({"step": "init", "message": f"🚀 [Session: {short_id}] Connecting to GitHub..."})
|
| 250 |
+
await asyncio.sleep(0.5)
|
| 251 |
+
|
| 252 |
+
await vector_db.reset() # 使用异步方法
|
| 253 |
+
|
| 254 |
+
chunker = UniversalChunker(config=ChunkingConfig(min_chunk_size=50))
|
| 255 |
+
|
| 256 |
+
file_list = await get_repo_structure(repo_url)
|
| 257 |
+
if not file_list:
|
| 258 |
+
raise Exception("Repository is empty or unreadable.")
|
| 259 |
+
|
| 260 |
+
yield json.dumps({"step": "fetched", "message": f"📦 Found {len(file_list)} files. Building Repo Map (AST Parsing)..."})
|
| 261 |
+
|
| 262 |
+
# === 接收 mapped_files 用于后续查重 + 计时 ===
|
| 263 |
+
map_start = time.time()
|
| 264 |
+
file_tree_str, mapped_files = await generate_repo_map(repo_url, file_list, limit=agent_config.initial_map_limit)
|
| 265 |
+
map_latency_ms = (time.time() - map_start) * 1000
|
| 266 |
+
tracing_service.add_event("repo_map_generated", {"latency_ms": map_latency_ms, "files_mapped": len(mapped_files)})
|
| 267 |
+
|
| 268 |
+
visited_files = set()
|
| 269 |
+
context_summary = ""
|
| 270 |
+
readme_file = next((f for f in file_list if f.lower().endswith("readme.md")), None)
|
| 271 |
+
|
| 272 |
+
for round_idx in range(agent_config.max_rounds):
|
| 273 |
+
yield json.dumps({"step": "thinking", "message": f"🕵️ [Round {round_idx+1}/{agent_config.max_rounds}] DeepSeek is analyzing Repo Map..."})
|
| 274 |
+
|
| 275 |
+
system_prompt = "You are a Senior Software Architect. Your goal is to understand the codebase."
|
| 276 |
+
user_content = f"""
|
| 277 |
+
[Project Repo Map]
|
| 278 |
+
(Contains file paths and key Class/Function signatures)
|
| 279 |
+
{file_tree_str}
|
| 280 |
+
|
| 281 |
+
[Files Already Read]
|
| 282 |
+
{list(visited_files)}
|
| 283 |
+
|
| 284 |
+
[Current Knowledge]
|
| 285 |
+
{context_summary}
|
| 286 |
+
|
| 287 |
+
[Task]
|
| 288 |
+
Select 1-{agent_config.files_per_round} MOST CRITICAL files to read next to understand the core logic.
|
| 289 |
+
Focus on files that seem to contain main logic based on the Repo Map symbols.
|
| 290 |
+
|
| 291 |
+
[Constraint]
|
| 292 |
+
Return ONLY a raw JSON list of strings. No markdown.
|
| 293 |
+
Example: ["src/main.py", "app/auth.py"]
|
| 294 |
+
"""
|
| 295 |
+
|
| 296 |
+
if not client:
|
| 297 |
+
yield json.dumps({"step": "error", "message": "❌ LLM Client Not Initialized."})
|
| 298 |
+
return
|
| 299 |
+
|
| 300 |
+
# === Token & Latency Tracing ===
|
| 301 |
+
llm_start_time = time.time()
|
| 302 |
+
plan_messages = [
|
| 303 |
+
{"role": "system", "content": system_prompt},
|
| 304 |
+
{"role": "user", "content": user_content}
|
| 305 |
+
]
|
| 306 |
+
|
| 307 |
+
response = await client.chat.completions.create(
|
| 308 |
+
model=settings.default_model_name,
|
| 309 |
+
messages=plan_messages,
|
| 310 |
+
temperature=0.1,
|
| 311 |
+
timeout=settings.LLM_TIMEOUT
|
| 312 |
+
)
|
| 313 |
+
|
| 314 |
+
llm_latency_ms = (time.time() - llm_start_time) * 1000
|
| 315 |
+
raw_content = response.choices[0].message.content
|
| 316 |
+
|
| 317 |
+
# 记录 Token 使用量
|
| 318 |
+
usage = getattr(response, 'usage', None)
|
| 319 |
+
tracing_service.record_llm_generation(
|
| 320 |
+
model=settings.default_model_name,
|
| 321 |
+
prompt_messages=plan_messages,
|
| 322 |
+
generated_text=raw_content,
|
| 323 |
+
total_latency_ms=llm_latency_ms,
|
| 324 |
+
prompt_tokens=usage.prompt_tokens if usage else None,
|
| 325 |
+
completion_tokens=usage.completion_tokens if usage else None,
|
| 326 |
+
total_tokens=usage.total_tokens if usage else None,
|
| 327 |
+
is_streaming=False,
|
| 328 |
+
metadata={"step": "file_selection", "round": round_idx + 1}
|
| 329 |
+
)
|
| 330 |
+
target_files = extract_json_from_text(raw_content)
|
| 331 |
+
|
| 332 |
+
valid_files = [f for f in target_files if f in file_list and f not in visited_files]
|
| 333 |
+
|
| 334 |
+
if round_idx == 0 and readme_file and readme_file not in visited_files and readme_file not in valid_files:
|
| 335 |
+
valid_files.insert(0, readme_file)
|
| 336 |
+
|
| 337 |
+
if not valid_files:
|
| 338 |
+
yield json.dumps({"step": "plan", "message": f"🛑 [Round {round_idx+1}] Sufficient context gathered."})
|
| 339 |
+
break
|
| 340 |
+
|
| 341 |
+
yield json.dumps({"step": "plan", "message": f"👉 [Round {round_idx+1}] Selected: {valid_files}"})
|
| 342 |
+
|
| 343 |
+
# === 并发模型缺陷优化 (并行下载处理) ===
|
| 344 |
+
async def process_single_file(file_path):
|
| 345 |
+
try:
|
| 346 |
+
file_start = time.time()
|
| 347 |
+
|
| 348 |
+
# 🔧 异步 GitHub API (已优化为非阻塞)
|
| 349 |
+
content = await get_file_content(repo_url, file_path)
|
| 350 |
+
if not content:
|
| 351 |
+
tracing_service.add_event("file_read_failed", {"file": file_path})
|
| 352 |
+
return None
|
| 353 |
+
|
| 354 |
+
# 1. 摘要与 Context
|
| 355 |
+
lines = content.split('\n')[:50]
|
| 356 |
+
preview = "\n".join(lines)
|
| 357 |
+
file_knowledge = f"\n--- File: {file_path} ---\n{preview}\n"
|
| 358 |
+
|
| 359 |
+
# 2. Repo Map 增量更新与查重
|
| 360 |
+
new_map_entry = None
|
| 361 |
+
if file_path not in mapped_files:
|
| 362 |
+
symbols = await asyncio.to_thread(_extract_symbols, content, file_path)
|
| 363 |
+
if symbols:
|
| 364 |
+
new_map_entry = f"{file_path}\n" + "\n".join(symbols)
|
| 365 |
+
|
| 366 |
+
# 3. 切片与入库
|
| 367 |
+
chunks = await asyncio.to_thread(chunker.chunk_file, content, file_path)
|
| 368 |
+
if chunks:
|
| 369 |
+
documents = [c["content"] for c in chunks]
|
| 370 |
+
metadatas = []
|
| 371 |
+
for c in chunks:
|
| 372 |
+
meta = c["metadata"]
|
| 373 |
+
metadatas.append({
|
| 374 |
+
"file": meta["file"],
|
| 375 |
+
"type": meta["type"],
|
| 376 |
+
"name": meta.get("name", ""),
|
| 377 |
+
"class": meta.get("class") or ""
|
| 378 |
+
})
|
| 379 |
+
if documents:
|
| 380 |
+
try:
|
| 381 |
+
await vector_db.add_documents(documents, metadatas)
|
| 382 |
+
except Exception as e:
|
| 383 |
+
print(f"❌ 索引错误 {file_path}: {e}")
|
| 384 |
+
# 不中断,继续处理其他文件
|
| 385 |
+
return None
|
| 386 |
+
|
| 387 |
+
file_latency_ms = (time.time() - file_start) * 1000
|
| 388 |
+
tracing_service.add_event("file_processed", {
|
| 389 |
+
"file": file_path,
|
| 390 |
+
"latency_ms": file_latency_ms,
|
| 391 |
+
"chunks_count": len(chunks) if chunks else 0
|
| 392 |
+
})
|
| 393 |
+
|
| 394 |
+
return {
|
| 395 |
+
"path": file_path,
|
| 396 |
+
"knowledge": file_knowledge,
|
| 397 |
+
"map_entry": new_map_entry
|
| 398 |
+
}
|
| 399 |
+
except Exception as e:
|
| 400 |
+
print(f"❌ 处理文件错误 {file_path}: {e}")
|
| 401 |
+
return None
|
| 402 |
+
|
| 403 |
+
# 提示开始并发下载
|
| 404 |
+
yield json.dumps({"step": "download", "message": f"📥 Starting parallel download for {len(valid_files)} files..."})
|
| 405 |
+
|
| 406 |
+
# 启动并发任务 (return_exceptions=True 防止单个失败导致整个中断)
|
| 407 |
+
tasks = [process_single_file(f) for f in valid_files]
|
| 408 |
+
results = await asyncio.gather(*tasks, return_exceptions=True)
|
| 409 |
+
|
| 410 |
+
# 聚合结果
|
| 411 |
+
download_count = 0
|
| 412 |
+
for res in results:
|
| 413 |
+
if not res or isinstance(res, Exception):
|
| 414 |
+
if isinstance(res, Exception):
|
| 415 |
+
print(f"❌ Task 异常: {res}")
|
| 416 |
+
continue
|
| 417 |
+
download_count += 1
|
| 418 |
+
visited_files.add(res["path"])
|
| 419 |
+
context_summary += res["knowledge"]
|
| 420 |
+
|
| 421 |
+
# 增量更新 Map
|
| 422 |
+
if res["map_entry"]:
|
| 423 |
+
file_tree_str = f"{res['map_entry']}\n\n{file_tree_str}"
|
| 424 |
+
mapped_files.add(res["path"])
|
| 425 |
+
|
| 426 |
+
# === 硬编码截断解耦 ===
|
| 427 |
+
context_summary = context_summary[:agent_config.max_context_length]
|
| 428 |
+
|
| 429 |
+
global_context_data = {
|
| 430 |
+
"file_tree": file_tree_str,
|
| 431 |
+
"summary": context_summary[:8000]
|
| 432 |
+
}
|
| 433 |
+
await vector_db.save_context(repo_url, global_context_data)
|
| 434 |
+
|
| 435 |
+
yield json.dumps({"step": "indexing", "message": f"🧠 [Round {round_idx+1}] Processed {download_count} files. Knowledge graph updated."})
|
| 436 |
+
|
| 437 |
+
# Final Report (正常分析模式下的提示)
|
| 438 |
+
yield json.dumps({"step": "generating", "message": "📝 Generating technical report..."})
|
| 439 |
+
|
| 440 |
+
# === 报告生成 (两种模式共用) ===
|
| 441 |
+
|
| 442 |
+
# === P0: 向量检索补充关键代码片段 ===
|
| 443 |
+
yield json.dumps({"step": "enriching", "message": "🔍 Retrieving key code snippets..."})
|
| 444 |
+
|
| 445 |
+
key_queries = [
|
| 446 |
+
"main entry point initialization startup",
|
| 447 |
+
"core business logic handler processor",
|
| 448 |
+
"API routes endpoints controllers",
|
| 449 |
+
"database models schema ORM",
|
| 450 |
+
"authentication authorization middleware"
|
| 451 |
+
]
|
| 452 |
+
|
| 453 |
+
retrieved_snippets = []
|
| 454 |
+
try:
|
| 455 |
+
await vector_db.initialize()
|
| 456 |
+
for query in key_queries:
|
| 457 |
+
results = await vector_db.search_hybrid(query, top_k=2)
|
| 458 |
+
for r in results:
|
| 459 |
+
snippet = r.get("content", "")[:400]
|
| 460 |
+
file_path = r.get("file", "unknown")
|
| 461 |
+
if snippet and snippet not in [s.split("]")[1] if "]" in s else s for s in retrieved_snippets]:
|
| 462 |
+
retrieved_snippets.append(f"[{file_path}]\n{snippet}")
|
| 463 |
+
except Exception as e:
|
| 464 |
+
print(f"⚠️ 向量检索失败: {e}")
|
| 465 |
+
|
| 466 |
+
code_snippets_section = "\n\n".join(retrieved_snippets[:8]) if retrieved_snippets else ""
|
| 467 |
+
|
| 468 |
+
# === P1: 依赖文件解析 ===
|
| 469 |
+
dep_files = ["requirements.txt", "pyproject.toml", "package.json", "go.mod", "Cargo.toml", "pom.xml", "build.gradle"]
|
| 470 |
+
dependencies_info = ""
|
| 471 |
+
|
| 472 |
+
# 获取 file_list(regenerate_only 模式下需要重新获取)
|
| 473 |
+
if regenerate_only:
|
| 474 |
+
try:
|
| 475 |
+
temp_file_list = await get_repo_structure(repo_url)
|
| 476 |
+
except:
|
| 477 |
+
temp_file_list = []
|
| 478 |
+
else:
|
| 479 |
+
temp_file_list = file_list if 'file_list' in dir() else []
|
| 480 |
+
|
| 481 |
+
for dep_file in dep_files:
|
| 482 |
+
matching = [f for f in temp_file_list if f.endswith(dep_file)]
|
| 483 |
+
for f in matching[:1]: # 只取第一个匹配
|
| 484 |
+
try:
|
| 485 |
+
content = await get_file_content(repo_url, f)
|
| 486 |
+
if content:
|
| 487 |
+
dependencies_info += f"\n[{f}]\n{content[:800]}\n"
|
| 488 |
+
except:
|
| 489 |
+
pass
|
| 490 |
+
|
| 491 |
+
# 构建增强的上下文
|
| 492 |
+
enhanced_context = f"""
|
| 493 |
+
{context_summary[:12000]}
|
| 494 |
+
|
| 495 |
+
[Key Code Snippets (Retrieved by Semantic Search)]
|
| 496 |
+
{code_snippets_section}
|
| 497 |
+
|
| 498 |
+
[Project Dependencies]
|
| 499 |
+
{dependencies_info if dependencies_info else "No dependency file found."}
|
| 500 |
+
"""
|
| 501 |
+
|
| 502 |
+
repo_map_injection = f"""
|
| 503 |
+
[Project Repo Map (Structure)]
|
| 504 |
+
{file_tree_str}
|
| 505 |
+
"""
|
| 506 |
+
|
| 507 |
+
# === 根据语言选择 Prompt ===
|
| 508 |
+
if language == "zh":
|
| 509 |
+
# --- 中文 Prompt ---
|
| 510 |
+
system_role = "你是一位务实的技术专家。目标是为开发者创建一个'3页纸'架构概览,让他们能在5分钟内看懂这个仓库。重点关注架构和数据流,不要纠结细节。"
|
| 511 |
+
analysis_user_content = f"""
|
| 512 |
+
[角色]
|
| 513 |
+
你是一位务实的技术专家(Tech Lead)。
|
| 514 |
+
|
| 515 |
+
[输入数据]
|
| 516 |
+
{repo_map_injection}
|
| 517 |
+
|
| 518 |
+
分析的文件: {list(visited_files)}
|
| 519 |
+
|
| 520 |
+
[代码知识库与关键片段]
|
| 521 |
+
{enhanced_context}
|
| 522 |
+
|
| 523 |
+
[严格限制]
|
| 524 |
+
1. **不进行代码审查**: 不要列出 Bug、缺失功能或改进建议。
|
| 525 |
+
2. **不评价**: 不要评价代码质量,只描述它**如何工作**。
|
| 526 |
+
3. **语调**: 专业、结构化、描述性。使用中文回答。
|
| 527 |
+
4. **不要废话**: 不要写"安全性"、"未来规划"等未请求的章节。
|
| 528 |
+
|
| 529 |
+
[输出格式要求 (Markdown)]
|
| 530 |
+
|
| 531 |
+
# 项目分析报告
|
| 532 |
+
|
| 533 |
+
## 1. 执行摘要 (Executive Summary)
|
| 534 |
+
- **用途**: (这个项目具体解决什么问题?1-2句话)
|
| 535 |
+
- **核心功能**: (列出Top 3功能点)
|
| 536 |
+
- **技术栈**: (语言、框架、数据库、关键库)
|
| 537 |
+
|
| 538 |
+
## 2. 系统架构 (Mermaid)
|
| 539 |
+
创建一个 `graph TD` 图。
|
| 540 |
+
- 展示高层组件 (如 Client, API Server, Database, Worker, External Service)。
|
| 541 |
+
- 在连线上标注数据流 (如 "HTTP", "SQL")。
|
| 542 |
+
- **风格**: 保持概念清晰简单,节点数量控制在 8 个以内。
|
| 543 |
+
|
| 544 |
+
**⚠️ Mermaid 语法严格要求 (v10.x)**:
|
| 545 |
+
1. **所有节点文本必须用双引号包裹**: `A["用户界面"]` ✓, `A[用户界面]` ✗
|
| 546 |
+
2. **所有连线标签必须用双引号包裹**: `-->|"HTTP请求"|` ✓, `-->|HTTP请求|` ✗
|
| 547 |
+
3. **禁止使用特殊字符**: 不要在文本中使用 `<br/>`, `/`, `(`, `)`, `&`, `<`, `>` 等
|
| 548 |
+
4. **使用简短英文ID**: 节点ID用简短英文如 `A`, `B`, `Client`, `API`
|
| 549 |
+
5. **subgraph 标题也需引号**: `subgraph "核心服务"` ✓
|
| 550 |
+
6. **数据库节点**: 使用 `[("数据库")]` 格式
|
| 551 |
+
|
| 552 |
+
- **正确示例**:
|
| 553 |
+
```mermaid
|
| 554 |
+
graph TD
|
| 555 |
+
Client["客户端"] -->|"HTTP请求"| API["API网关"]
|
| 556 |
+
API --> Service["业务服务"]
|
| 557 |
+
Service --> DB[("数据库")]
|
| 558 |
+
Service -->|"调用"| External["外部服务"]
|
| 559 |
+
```
|
| 560 |
+
|
| 561 |
+
## 3. 核心逻辑分析 (Table)
|
| 562 |
+
(总结关键模块,不要列出所有文件,只列最重要的)
|
| 563 |
+
|
| 564 |
+
| 组件/文件 | 职责 (它做什么?) | 关键设计模式/逻辑 |
|
| 565 |
+
| :--- | :--- | :--- |
|
| 566 |
+
| 例如 `auth_service.py` | 处理JWT颁发与验证 | 单例模式, 路由装饰器 |
|
| 567 |
+
| ... | ... | ... |
|
| 568 |
+
|
| 569 |
+
## 4. 🔬 核心方法深度解析
|
| 570 |
+
(精选 3-5 个最关键的 `.py` 文件。针对每个文件,列出驱动逻辑的 Top 2-3 个方法)
|
| 571 |
+
|
| 572 |
+
### 4.1 `[文件名]`
|
| 573 |
+
* **`[方法名]`**: [解释它做什么以及为什么重要,不要贴代码]
|
| 574 |
+
* **`[方法名]`**: [解释...]
|
| 575 |
+
|
| 576 |
+
## 5. 主要工作流 (Mermaid)
|
| 577 |
+
选择**一个最重要**的业务流程 (Happy Path)。
|
| 578 |
+
创建一个 `sequenceDiagram`。
|
| 579 |
+
- 参与者应该是高层概念 (如 User, API, DB),不要用具体变量名。
|
| 580 |
+
|
| 581 |
+
**⚠️ sequenceDiagram 语法要求**:
|
| 582 |
+
1. **participant 别名格式**: `participant API as "API服务"` ✓
|
| 583 |
+
2. **消息文本用双引号**: `User->>API: "发起请求"` ✓
|
| 584 |
+
3. **避免特殊字符**: 不要在消息中使用 `/`, `&`, `<`, `>` 等
|
| 585 |
+
|
| 586 |
+
- **正确示例**:
|
| 587 |
+
```mermaid
|
| 588 |
+
sequenceDiagram
|
| 589 |
+
participant User as "用户"
|
| 590 |
+
participant API as "API服务"
|
| 591 |
+
participant DB as "数据库"
|
| 592 |
+
User->>API: "发起请求"
|
| 593 |
+
API->>DB: "查询数据"
|
| 594 |
+
DB-->>API: "返回结果"
|
| 595 |
+
API-->>User: "响应数据"
|
| 596 |
+
```
|
| 597 |
+
|
| 598 |
+
## 6. 快速开始 (Quick Start)
|
| 599 |
+
- **前置条件**: (如 Docker, Python 3.9+, .env 配置)
|
| 600 |
+
- **入口**: (如何启动主逻辑?如 `python main.py`)
|
| 601 |
+
"""
|
| 602 |
+
else:
|
| 603 |
+
analysis_user_content = f"""
|
| 604 |
+
[Role]
|
| 605 |
+
You are a **Pragmatic Tech Lead**. Your goal is to create a **"3-Pages" Architecture Overview** for a developer who wants to understand this repo in 5 minutes.
|
| 606 |
+
[Input Data]
|
| 607 |
+
{repo_map_injection}
|
| 608 |
+
|
| 609 |
+
Files analyzed: {list(visited_files)}
|
| 610 |
+
|
| 611 |
+
[Code Knowledge & Key Snippets]
|
| 612 |
+
{enhanced_context}
|
| 613 |
+
|
| 614 |
+
[Strict Constraints]
|
| 615 |
+
1. **NO Code Review**: Do NOT list bugs, issues, missing features, or recommendations.
|
| 616 |
+
2. **NO Critique**: Do not judge the code quality. Focus on HOW it works.
|
| 617 |
+
3. **Tone**: Professional, descriptive, and structural.
|
| 618 |
+
4. **NO "FLUFF"**: Do NOT add unrequested sections like "Security", "Scalability", "Data Models", "Future Enhancements", etc.
|
| 619 |
+
|
| 620 |
+
[Required Output Format (Markdown)]
|
| 621 |
+
|
| 622 |
+
# Project Analysis Report
|
| 623 |
+
|
| 624 |
+
## 1. Executive Summary
|
| 625 |
+
- **Purpose**: (What specific problem does this project solve? 1-2 sentences)
|
| 626 |
+
- **Key Features**: (Bullet points of top 3 features)
|
| 627 |
+
- **Tech Stack**: (List languages, frameworks, databases, and key libs)
|
| 628 |
+
|
| 629 |
+
## 2. System Architecture
|
| 630 |
+
Create a `graph TD` diagram.
|
| 631 |
+
- Show high-level components (e.g., Client, API Server, Database, Worker, External Service).
|
| 632 |
+
- Label the edges with data flow (e.g., "HTTP", "SQL").
|
| 633 |
+
- **Style**: Keep it simple and conceptual. Limit to 8 nodes max.
|
| 634 |
+
|
| 635 |
+
**⚠️ Mermaid Syntax Rules (v10.x - MUST FOLLOW)**:
|
| 636 |
+
1. **Wrap ALL node text in double quotes**: `A["User Client"]` ✓, `A[User Client]` ✗
|
| 637 |
+
2. **Wrap ALL edge labels in double quotes**: `-->|"HTTP Request"|` ✓, `-->|HTTP Request|` ✗
|
| 638 |
+
3. **NO special characters in text**: Avoid `/`, `()`, `&`, `<>`, `<br/>` in labels
|
| 639 |
+
4. **Use short alphanumeric IDs**: e.g., `A`, `B`, `Client`, `API`, `DB`
|
| 640 |
+
5. **Subgraph titles need quotes**: `subgraph "Core Services"` ✓
|
| 641 |
+
6. **Database node format**: Use `[("Database")]` for cylinder shape
|
| 642 |
+
|
| 643 |
+
- **Correct Example**:
|
| 644 |
+
```mermaid
|
| 645 |
+
graph TD
|
| 646 |
+
Client["User Client"] -->|"HTTP Request"| API["API Gateway"]
|
| 647 |
+
API --> Service["Business Service"]
|
| 648 |
+
Service --> DB[("Database")]
|
| 649 |
+
Service -->|"Calls"| External["External API"]
|
| 650 |
+
```
|
| 651 |
+
|
| 652 |
+
## 3. Core Logic Analysis
|
| 653 |
+
(Create a Markdown Table to summarize key modules. Do not list every file, only the most important ones.)
|
| 654 |
+
|
| 655 |
+
| Component/File | Responsibility (What does it do?) | Key Design Pattern / Logic |
|
| 656 |
+
| :--- | :--- | :--- |
|
| 657 |
+
| e.g. `auth_service.py` | Handles JWT issuance and verification | Singleton, Decorator for routes |
|
| 658 |
+
| ... | ... | ... |
|
| 659 |
+
|
| 660 |
+
## 4. Core Methods Deep Dive
|
| 661 |
+
(Select the 3-5 most critical `.py` files. For each, list the top 2-3 methods that drive the logic.)
|
| 662 |
+
|
| 663 |
+
### 4.1 `[Filename, e.g., agent_service.py]`
|
| 664 |
+
* **`[Method Name]`**: [Explanation of what it does and why it matters. No code.]
|
| 665 |
+
* **`[Method Name]`**: [Explanation...]
|
| 666 |
+
|
| 667 |
+
### 4.2 `[Filename, e.g., vector_service.py]`
|
| 668 |
+
* **`[Method Name]`**: [Explanation...]
|
| 669 |
+
* ...
|
| 670 |
+
|
| 671 |
+
## 5. Main Workflow (Mermaid)
|
| 672 |
+
Select the **Single Most Important** business flow (The "Happy Path").
|
| 673 |
+
Create a `sequenceDiagram`.
|
| 674 |
+
- Participants should be high-level (e.g., User, API, DB), not specific variable names.
|
| 675 |
+
|
| 676 |
+
**⚠️ sequenceDiagram Syntax Rules**:
|
| 677 |
+
1. **Wrap participant aliases in quotes**: `participant API as "API Server"` ✓
|
| 678 |
+
2. **Wrap message text in quotes**: `User->>API: "Send Request"` ✓
|
| 679 |
+
3. **NO special characters**: Avoid `/`, `&`, `<`, `>` in messages
|
| 680 |
+
|
| 681 |
+
- **Correct Example**:
|
| 682 |
+
```mermaid
|
| 683 |
+
sequenceDiagram
|
| 684 |
+
participant User as "User"
|
| 685 |
+
participant API as "API Server"
|
| 686 |
+
participant DB as "Database"
|
| 687 |
+
User->>API: "Send Request"
|
| 688 |
+
API->>DB: "Query Data"
|
| 689 |
+
DB-->>API: "Return Result"
|
| 690 |
+
API-->>User: "Send Response"
|
| 691 |
+
```
|
| 692 |
+
|
| 693 |
+
## 6. Quick Start Guide
|
| 694 |
+
- **Prerequisites**: (e.g. Docker, Python 3.9+, .env file)
|
| 695 |
+
- **Entry Point**: (How to run the main logic? e.g. `python main.py` or `uvicorn`)
|
| 696 |
+
|
| 697 |
+
"""
|
| 698 |
+
|
| 699 |
+
# === 增加 timeout 防止长文本生成时断连 ===
|
| 700 |
+
report_messages = [
|
| 701 |
+
{"role": "system", "content": "You are a pragmatic Tech Lead. Focus on architecture and data flow, not implementation details."},
|
| 702 |
+
{"role": "user", "content": analysis_user_content}
|
| 703 |
+
]
|
| 704 |
+
|
| 705 |
+
stream_start_time = time.time()
|
| 706 |
+
stream = await client.chat.completions.create(
|
| 707 |
+
model=settings.default_model_name,
|
| 708 |
+
messages=report_messages,
|
| 709 |
+
stream=True,
|
| 710 |
+
timeout=settings.LLM_TIMEOUT # 使用统一配置
|
| 711 |
+
)
|
| 712 |
+
|
| 713 |
+
# === TTFT & Token Tracking ===
|
| 714 |
+
first_token_received = False
|
| 715 |
+
ttft_ms = None
|
| 716 |
+
generated_text = ""
|
| 717 |
+
completion_tokens_estimate = 0
|
| 718 |
+
|
| 719 |
+
# === 增加 try-except 捕获流式传输中断 ===
|
| 720 |
+
try:
|
| 721 |
+
async for chunk in stream:
|
| 722 |
+
if chunk.choices[0].delta.content:
|
| 723 |
+
content = chunk.choices[0].delta.content
|
| 724 |
+
|
| 725 |
+
# 记录 TTFT (首 Token 时间)
|
| 726 |
+
if not first_token_received:
|
| 727 |
+
ttft_ms = (time.time() - stream_start_time) * 1000
|
| 728 |
+
tracing_service.record_ttft(
|
| 729 |
+
ttft_ms=ttft_ms,
|
| 730 |
+
model=settings.default_model_name,
|
| 731 |
+
metadata={"step": "report_generation"}
|
| 732 |
+
)
|
| 733 |
+
first_token_received = True
|
| 734 |
+
|
| 735 |
+
generated_text += content
|
| 736 |
+
completion_tokens_estimate += 1 # 粗略估计每个 chunk 约 1 token
|
| 737 |
+
yield json.dumps({"step": "report_chunk", "chunk": content})
|
| 738 |
+
except (httpx.ReadError, httpx.ConnectError) as e:
|
| 739 |
+
yield json.dumps({"step": "error", "message": f"⚠️ Network Timeout during generation: {str(e)}"})
|
| 740 |
+
return
|
| 741 |
+
|
| 742 |
+
# 流结束后记录完整的 LLM 生成信息
|
| 743 |
+
total_latency_ms = (time.time() - stream_start_time) * 1000
|
| 744 |
+
tracing_service.record_llm_generation(
|
| 745 |
+
model=settings.default_model_name,
|
| 746 |
+
prompt_messages=report_messages,
|
| 747 |
+
generated_text=generated_text,
|
| 748 |
+
ttft_ms=ttft_ms,
|
| 749 |
+
total_latency_ms=total_latency_ms,
|
| 750 |
+
completion_tokens=completion_tokens_estimate,
|
| 751 |
+
is_streaming=True,
|
| 752 |
+
metadata={"step": "report_generation", "generated_chars": len(generated_text)}
|
| 753 |
+
)
|
| 754 |
+
|
| 755 |
+
# === 保存报告 (按语言存储,异步避免阻塞) ===
|
| 756 |
+
await vector_db.save_report(generated_text, language)
|
| 757 |
+
|
| 758 |
+
yield json.dumps({"step": "finish", "message": "✅ Analysis Complete!"})
|
| 759 |
+
|
| 760 |
+
except Exception as e:
|
| 761 |
+
# === 全局异常捕获 ===
|
| 762 |
+
import traceback
|
| 763 |
+
traceback.print_exc()
|
| 764 |
+
|
| 765 |
+
# 提取友好的错误信息
|
| 766 |
+
error_msg = str(e)
|
| 767 |
+
if "401" in error_msg:
|
| 768 |
+
ui_msg = "❌ GitHub Token Invalid. Please check your settings."
|
| 769 |
+
elif "403" in error_msg:
|
| 770 |
+
ui_msg = "❌ GitHub API Rate Limit Exceeded. Try again later or add a Token."
|
| 771 |
+
elif "404" in error_msg:
|
| 772 |
+
ui_msg = "❌ Repository Not Found. Check the URL."
|
| 773 |
+
elif "Timeout" in error_msg or "ConnectError" in error_msg:
|
| 774 |
+
ui_msg = "❌ Network Timeout. LLM or GitHub is not responding."
|
| 775 |
+
else:
|
| 776 |
+
ui_msg = f"💥 System Error: {error_msg}"
|
| 777 |
+
|
| 778 |
+
yield json.dumps({"step": "error", "message": ui_msg})
|
| 779 |
+
return # 终止流
|
app/services/auto_evaluation_service.py
ADDED
|
@@ -0,0 +1,481 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 文件路径: app/services/auto_evaluation_service.py
|
| 2 |
+
"""
|
| 3 |
+
自动评估服务 - Phase 1
|
| 4 |
+
在后台异步进行评估,不阻塞用户请求
|
| 5 |
+
|
| 6 |
+
工作流程:
|
| 7 |
+
1. 用户调用 /chat 或 /analyze
|
| 8 |
+
2. 获得立即响应
|
| 9 |
+
3. 后台异步执行评估
|
| 10 |
+
4. 评估结果存储到 evaluation/sft_data/
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import asyncio
|
| 14 |
+
import json
|
| 15 |
+
import os
|
| 16 |
+
from datetime import datetime
|
| 17 |
+
from typing import Optional
|
| 18 |
+
from dataclasses import dataclass
|
| 19 |
+
|
| 20 |
+
from evaluation.evaluation_framework import (
|
| 21 |
+
EvaluationEngine,
|
| 22 |
+
EvaluationResult,
|
| 23 |
+
DataRoutingEngine,
|
| 24 |
+
DataQualityTier
|
| 25 |
+
)
|
| 26 |
+
from evaluation.utils import is_chatty_query, has_code_indicators
|
| 27 |
+
from app.services.tracing_service import tracing_service
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@dataclass
class EvaluationConfig:
    """
    Configuration for the automatic evaluation service.

    Data-routing thresholds (stated by the original author to match data_router.py):
    - score > 0.9  → Gold     → positive_samples.jsonl
    - score > 0.6  → Silver   → positive_samples.jsonl
    - score > 0.4  → Bronze   → negative_samples.jsonl
    - score <= 0.4 → Rejected → not stored
    """
    enabled: bool = True  # whether automatic evaluation is enabled
    use_ragas: bool = False  # whether to run Ragas as a sanity check
    custom_weight: float = 0.7  # weight given to the custom evaluation score
    ragas_weight: float = 0.3  # weight given to the Ragas score
    diff_threshold: float = 0.2  # score gap above which a sample is marked needs_review
    min_quality_score: float = 0.4  # minimum quality score (a sample is rejected only when <= 0.4)
    async_evaluation: bool = True  # whether to evaluate in the background (True recommended)
    min_query_length: int = 10  # minimum query length
    min_answer_length: int = 100  # minimum answer length
    require_repo_url: bool = True  # whether a repository URL is required
    require_code_in_context: bool = True  # whether the retrieved context must contain code
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class AutoEvaluationService:
|
| 55 |
+
"""自动评估服务"""
|
| 56 |
+
|
| 57 |
+
def __init__(
    self,
    eval_engine: EvaluationEngine,
    data_router: DataRoutingEngine,
    config: Optional[EvaluationConfig] = None
):
    """
    Store the collaborators and prepare the skipped-samples log location.

    Args:
        eval_engine: engine used to score generated answers.
        data_router: router that receives evaluated samples.
        config: evaluation settings; a default EvaluationConfig is used when omitted.

    Side effects:
        Creates the directory of the skipped-samples file if it does not exist.
    """
    self.eval_engine = eval_engine
    self.data_router = data_router
    self.config = config or EvaluationConfig()
    self.needs_review_queue: list = []  # samples waiting for manual review
    self._evaluated_keys: set = set()  # de-duplication keys ("session_id:query_hash")

    # File that records samples filtered out before evaluation
    self.skipped_samples_file = "evaluation/sft_data/skipped_samples.jsonl"
    os.makedirs(os.path.dirname(self.skipped_samples_file), exist_ok=True)
|
| 72 |
+
|
| 73 |
+
def _record_skipped(self, reason: str, query: str, session_id: str,
|
| 74 |
+
repo_url: str = "", context_len: int = 0, answer_len: int = 0) -> None:
|
| 75 |
+
"""记录被跳过的样本(供日后分析)"""
|
| 76 |
+
record = {
|
| 77 |
+
"timestamp": datetime.now().isoformat(),
|
| 78 |
+
"reason": reason,
|
| 79 |
+
"session_id": session_id,
|
| 80 |
+
"query": query[:200] if query else "",
|
| 81 |
+
"repo_url": repo_url,
|
| 82 |
+
"context_length": context_len,
|
| 83 |
+
"answer_length": answer_len
|
| 84 |
+
}
|
| 85 |
+
try:
|
| 86 |
+
with open(self.skipped_samples_file, 'a', encoding='utf-8') as f:
|
| 87 |
+
f.write(json.dumps(record, ensure_ascii=False) + '\n')
|
| 88 |
+
except Exception as e:
|
| 89 |
+
print(f" ⚠️ 记录跳过样本失败: {e}")
|
| 90 |
+
|
| 91 |
+
def _validate_input(
|
| 92 |
+
self,
|
| 93 |
+
query: str,
|
| 94 |
+
retrieved_context: str,
|
| 95 |
+
generated_answer: str,
|
| 96 |
+
session_id: str,
|
| 97 |
+
repo_url: str
|
| 98 |
+
) -> tuple[bool, Optional[str]]:
|
| 99 |
+
"""
|
| 100 |
+
验证输入是否满足评估条件
|
| 101 |
+
|
| 102 |
+
Returns:
|
| 103 |
+
(is_valid, skip_reason) - 如果有效返回 (True, None),否则返回 (False, reason)
|
| 104 |
+
"""
|
| 105 |
+
context_len = len(retrieved_context) if retrieved_context else 0
|
| 106 |
+
answer_len = len(generated_answer) if generated_answer else 0
|
| 107 |
+
|
| 108 |
+
# Query 验证
|
| 109 |
+
if not query or not query.strip():
|
| 110 |
+
self._record_skipped("query_empty", query or "", session_id, repo_url, context_len, answer_len)
|
| 111 |
+
return False, "query 为空"
|
| 112 |
+
|
| 113 |
+
if len(query.strip()) < self.config.min_query_length:
|
| 114 |
+
self._record_skipped("query_too_short", query, session_id, repo_url, context_len, answer_len)
|
| 115 |
+
return False, f"query 太短 ({len(query)} < {self.config.min_query_length})"
|
| 116 |
+
|
| 117 |
+
if is_chatty_query(query):
|
| 118 |
+
self._record_skipped("chatty_query", query, session_id, repo_url, context_len, answer_len)
|
| 119 |
+
return False, f"闲聊/无效 query: {query[:30]}"
|
| 120 |
+
|
| 121 |
+
# Repo URL 验证
|
| 122 |
+
if self.config.require_repo_url and not repo_url:
|
| 123 |
+
self._record_skipped("missing_repo_url", query, session_id, repo_url, context_len, answer_len)
|
| 124 |
+
return False, "缺少 repo_url"
|
| 125 |
+
|
| 126 |
+
# Answer 验证
|
| 127 |
+
if not generated_answer or len(generated_answer.strip()) < self.config.min_answer_length:
|
| 128 |
+
self._record_skipped("answer_too_short", query, session_id, repo_url, context_len, answer_len)
|
| 129 |
+
return False, f"回答太短 ({answer_len} < {self.config.min_answer_length})"
|
| 130 |
+
|
| 131 |
+
# Context 验证
|
| 132 |
+
if self.config.require_code_in_context and not has_code_indicators(retrieved_context):
|
| 133 |
+
self._record_skipped("no_code_in_context", query, session_id, repo_url, context_len, answer_len)
|
| 134 |
+
return False, "上下文中未检测到代码"
|
| 135 |
+
|
| 136 |
+
return True, None
|
| 137 |
+
|
| 138 |
+
def _check_duplicate(self, query: str, session_id: str) -> bool:
|
| 139 |
+
"""检查是否重复评估,返回 True 表示是重复的"""
|
| 140 |
+
import hashlib
|
| 141 |
+
query_hash = hashlib.md5(query.encode()).hexdigest()[:8]
|
| 142 |
+
eval_key = f"{session_id}:{query_hash}"
|
| 143 |
+
|
| 144 |
+
if eval_key in self._evaluated_keys:
|
| 145 |
+
return True
|
| 146 |
+
|
| 147 |
+
self._evaluated_keys.add(eval_key)
|
| 148 |
+
|
| 149 |
+
# 限制缓存大小,防止内存泄漏
|
| 150 |
+
if len(self._evaluated_keys) > 1000:
|
| 151 |
+
self._evaluated_keys = set(list(self._evaluated_keys)[-500:])
|
| 152 |
+
|
| 153 |
+
return False
|
| 154 |
+
|
| 155 |
+
async def auto_evaluate(
    self,
    query: str,
    retrieved_context: str,
    generated_answer: str,
    session_id: str = "auto",
    repo_url: str = "",
    language: str = "en"
) -> Optional[str]:
    """
    Evaluate a single query/answer pair and route the result.

    Steps: validate the input, skip duplicates, run the custom evaluation,
    optionally run the Ragas sanity check, combine the scores, then route
    or reject the sample and emit a tracing event.

    Args:
        query: the user query.
        retrieved_context: context text handed to the generator.
        generated_answer: the answer produced for the query.
        session_id: session identifier used for de-duplication and records.
        repo_url: repository URL attached to the result.
        language: language tag attached to the result.

    Returns:
        ``eval_result.data_quality_tier.value`` for an evaluated sample, or
        None when the sample was skipped or evaluation raised.
        NOTE(review): the original docstring lists
        gold/silver/bronze/rejected/needs_review as possible values; the
        enum itself is defined elsewhere — confirm.
    """
    # Input validation
    is_valid, skip_reason = self._validate_input(
        query, retrieved_context, generated_answer, session_id, repo_url
    )
    if not is_valid:
        print(f" ⚠️ [AutoEval] 跳过: {skip_reason}")
        return None

    # Skip pairs that were already evaluated
    if self._check_duplicate(query, session_id):
        print(f" ⏭️ [AutoEval] 跳过重复评估: {query[:30]}...")
        return None

    start_time = datetime.now()

    try:
        # Step 1: custom evaluation
        print(f"📊 [AutoEval] 开始评估: {query[:50]}...")

        custom_metrics = await self.eval_engine.evaluate_generation(
            query=query,
            retrieved_context=retrieved_context,
            generated_answer=generated_answer
        )
        custom_score = custom_metrics.overall_score()

        print(f" ✓ Custom Score: {custom_score:.3f}")
        print(f" - Faithfulness: {custom_metrics.faithfulness:.3f}")
        print(f" - Answer Relevance: {custom_metrics.answer_relevance:.3f}")
        print(f" - Completeness: {custom_metrics.answer_completeness:.3f}")

        # Step 2: Ragas sanity check (when enabled)
        ragas_score = None
        ragas_details = None

        if self.config.use_ragas:
            try:
                ragas_score, ragas_details = await self._ragas_eval(
                    query=query,
                    context=retrieved_context,
                    answer=generated_answer
                )
                print(f" ✓ Ragas Score: {ragas_score:.3f}")
                if ragas_details:
                    print(f" - {ragas_details}")
            except Exception as e:
                print(f" ⚠️ Ragas 评估失败: {e}")
                # A Ragas failure must not interrupt the main flow.

        # A Ragas score of 0.0 is a real result; only None means "no score".
        has_ragas_score = ragas_score is not None

        # ============================================================
        # Step 3: combined score + anomaly detection
        # ============================================================
        final_score, quality_status = self._compute_final_score(
            custom_score=custom_score,
            ragas_score=ragas_score
        )

        print(f" ✓ Final Score: {final_score:.3f} | Status: {quality_status}")

        # ============================================================
        # Step 4: build the evaluation result and store it
        # ============================================================
        eval_result = EvaluationResult(
            session_id=session_id,
            query=query,
            repo_url=repo_url,
            timestamp=start_time,
            language=language,
            generation_metrics=custom_metrics,
            notes=f"ragas_score={ragas_score:.3f}" if has_ragas_score else ""
        )

        # Record the combined score
        eval_result.overall_score = final_score

        # Decide the quality tier from the status and the score
        print(f" [DEBUG] quality_status={quality_status}, final_score={final_score:.3f}, threshold={self.config.min_quality_score}")

        if quality_status == "needs_review":
            eval_result.data_quality_tier = DataQualityTier.BRONZE
            eval_result.notes += " | needs_review=true"
            # Queue the sample for manual review
            self.needs_review_queue.append({
                "eval_result": eval_result,
                "custom_score": custom_score,
                "ragas_score": ragas_score,
                "diff": abs(custom_score - ragas_score) if has_ragas_score else 0.0,
                "timestamp": start_time.isoformat()
            })
            print(f" ⚠️ 需要人工审查 (needs_review),暂存队列")
            # Also route it to storage so it can be analysed later
            self.data_router.route_sample(eval_result)
        elif final_score > self.config.min_quality_score:
            # Above the minimum: the router decides where the sample goes
            print(f" ✓ 路由到 data_router (score {final_score:.2f} > {self.config.min_quality_score})")
            self.data_router.route_sample(eval_result)
        else:
            # At or below the minimum: quality too low, do not store
            eval_result.data_quality_tier = DataQualityTier.REJECTED
            print(f" ❌ 评分过低 ({final_score:.2f} <= {self.config.min_quality_score}),拒绝存储")

        # Emit a tracing event
        tracing_service.add_event("auto_evaluation_completed", {
            "query": query[:100],
            "custom_score": custom_score,
            "ragas_score": ragas_score,
            "final_score": final_score,
            "status": quality_status,
            "quality_tier": eval_result.data_quality_tier.value
        })

        print(f" ✅ 评估完成\n")

        return eval_result.data_quality_tier.value

    except Exception as e:
        print(f" ❌ 自动评估异常: {e}")
        import traceback
        traceback.print_exc()
        return None
|
| 290 |
+
|
| 291 |
+
async def auto_evaluate_async(
    self,
    query: str,
    retrieved_context: str,
    generated_answer: str,
    session_id: str = "auto",
    repo_url: str = "",
    language: str = "en"
) -> None:
    """
    Evaluate one question/answer pair, normally without blocking the caller.

    When ``self.config.async_evaluation`` is false the evaluation is awaited
    inline. Otherwise it is scheduled as a background task via ``_eval_task``
    and this coroutine returns immediately without waiting for the result.

    Args:
        query: The user's question.
        retrieved_context: Context text that was retrieved for the question.
        generated_answer: The answer that was produced.
        session_id: Session identifier forwarded to the evaluation.
        repo_url: Repository URL forwarded to the evaluation.
        language: Language code forwarded to the evaluation.
    """
    eval_kwargs = dict(
        query=query,
        retrieved_context=retrieved_context,
        generated_answer=generated_answer,
        session_id=session_id,
        repo_url=repo_url,
        language=language,
    )

    if not self.config.async_evaluation:
        # Synchronous mode: the caller waits for the evaluation to finish.
        await self.auto_evaluate(**eval_kwargs)
        return

    # The event loop keeps only weak references to tasks, so a task whose
    # handle is dropped may be garbage-collected before it finishes. Hold a
    # strong reference until the task is done.
    background_tasks = getattr(self, "_background_tasks", None)
    if background_tasks is None:
        background_tasks = set()
        self._background_tasks = background_tasks

    task = asyncio.create_task(self._eval_task(**eval_kwargs))
    background_tasks.add(task)
    task.add_done_callback(background_tasks.discard)
|
| 327 |
+
|
| 328 |
+
async def _eval_task(
|
| 329 |
+
self,
|
| 330 |
+
query: str,
|
| 331 |
+
retrieved_context: str,
|
| 332 |
+
generated_answer: str,
|
| 333 |
+
session_id: str,
|
| 334 |
+
repo_url: str,
|
| 335 |
+
language: str
|
| 336 |
+
) -> None:
|
| 337 |
+
"""后台评估任务包装"""
|
| 338 |
+
try:
|
| 339 |
+
await asyncio.sleep(0.1) # 让用户请求先返回
|
| 340 |
+
await self.auto_evaluate(
|
| 341 |
+
query=query,
|
| 342 |
+
retrieved_context=retrieved_context,
|
| 343 |
+
generated_answer=generated_answer,
|
| 344 |
+
session_id=session_id,
|
| 345 |
+
repo_url=repo_url,
|
| 346 |
+
language=language
|
| 347 |
+
)
|
| 348 |
+
except Exception as e:
|
| 349 |
+
print(f"❌ Background eval task failed: {e}")
|
| 350 |
+
|
| 351 |
+
def _compute_final_score(
|
| 352 |
+
self,
|
| 353 |
+
custom_score: float,
|
| 354 |
+
ragas_score: Optional[float]
|
| 355 |
+
) -> tuple[float, str]:
|
| 356 |
+
"""
|
| 357 |
+
计算最终得分和状态
|
| 358 |
+
|
| 359 |
+
Returns:
|
| 360 |
+
(final_score, status)
|
| 361 |
+
status: "normal" / "needs_review" / "high_confidence"
|
| 362 |
+
"""
|
| 363 |
+
|
| 364 |
+
if ragas_score is None:
|
| 365 |
+
# 没有 Ragas 分数,直接用 custom 分数
|
| 366 |
+
return custom_score, "normal"
|
| 367 |
+
|
| 368 |
+
# 计算差异
|
| 369 |
+
diff = abs(custom_score - ragas_score)
|
| 370 |
+
|
| 371 |
+
# 判断异常
|
| 372 |
+
if diff > self.config.diff_threshold:
|
| 373 |
+
# 差异过大,标记为需要审查
|
| 374 |
+
return custom_score, "needs_review"
|
| 375 |
+
|
| 376 |
+
# 混合评分
|
| 377 |
+
final_score = (
|
| 378 |
+
self.config.custom_weight * custom_score +
|
| 379 |
+
self.config.ragas_weight * ragas_score
|
| 380 |
+
)
|
| 381 |
+
|
| 382 |
+
# 两者都高分 → 高置信度
|
| 383 |
+
if custom_score > 0.75 and ragas_score > 0.75:
|
| 384 |
+
status = "high_confidence"
|
| 385 |
+
else:
|
| 386 |
+
status = "normal"
|
| 387 |
+
|
| 388 |
+
return final_score, status
|
| 389 |
+
|
| 390 |
+
async def _ragas_eval(
|
| 391 |
+
self,
|
| 392 |
+
query: str,
|
| 393 |
+
context: str,
|
| 394 |
+
answer: str
|
| 395 |
+
) -> tuple[Optional[float], Optional[str]]:
|
| 396 |
+
"""
|
| 397 |
+
使用 Ragas 进行 sanity check
|
| 398 |
+
|
| 399 |
+
Returns:
|
| 400 |
+
(score, details)
|
| 401 |
+
"""
|
| 402 |
+
try:
|
| 403 |
+
from ragas.metrics import faithfulness, answer_relevancy
|
| 404 |
+
from ragas import evaluate
|
| 405 |
+
|
| 406 |
+
# 构造 Ragas 数据集
|
| 407 |
+
dataset_dict = {
|
| 408 |
+
"question": [query],
|
| 409 |
+
"contexts": [[context]],
|
| 410 |
+
"answer": [answer]
|
| 411 |
+
}
|
| 412 |
+
|
| 413 |
+
# 执行评估
|
| 414 |
+
result = evaluate(
|
| 415 |
+
dataset=dataset_dict,
|
| 416 |
+
metrics=[faithfulness, answer_relevancy]
|
| 417 |
+
)
|
| 418 |
+
|
| 419 |
+
# 提取分数
|
| 420 |
+
faithfulness_score = result["faithfulness"][0] if "faithfulness" in result else 0.5
|
| 421 |
+
relevancy_score = result["answer_relevancy"][0] if "answer_relevancy" in result else 0.5
|
| 422 |
+
|
| 423 |
+
# 平均得分
|
| 424 |
+
ragas_score = (faithfulness_score + relevancy_score) / 2
|
| 425 |
+
|
| 426 |
+
details = f"Ragas: faithfulness={faithfulness_score:.3f}, relevancy={relevancy_score:.3f}"
|
| 427 |
+
|
| 428 |
+
return ragas_score, details
|
| 429 |
+
|
| 430 |
+
except ImportError:
|
| 431 |
+
print("⚠️ Ragas 未安装,跳过 sanity check")
|
| 432 |
+
return None, None
|
| 433 |
+
except Exception as e:
|
| 434 |
+
print(f"⚠️ Ragas 评估异常: {e}")
|
| 435 |
+
return None, None
|
| 436 |
+
|
| 437 |
+
def get_review_queue(self) -> list:
    """Return the samples currently waiting for manual review.

    The service's own queue list is returned, not a copy.
    """
    pending = self.needs_review_queue
    return pending
|
| 440 |
+
|
| 441 |
+
def clear_review_queue(self) -> None:
    """Remove every sample from the review queue (in place)."""
    pending = self.needs_review_queue
    pending.clear()
|
| 444 |
+
|
| 445 |
+
def approve_sample(self, index: int) -> None:
    """Manually approve the queued sample at ``index``.

    The sample's evaluation result is handed to the data router and the
    sample is then removed from the review queue, mirroring what
    ``reject_sample`` does. Leaving it queued would let the same sample be
    approved and stored again. An out-of-range index is ignored.

    Args:
        index: Position of the sample in the review queue.
    """
    queue = self.needs_review_queue
    if not 0 <= index < len(queue):
        return

    # Route first: if routing raises, the sample stays queued for a retry.
    # NOTE(review): the needs_review branch of auto_evaluate also calls
    # route_sample when it enqueues the sample — confirm the router
    # tolerates receiving the same result twice.
    self.data_router.route_sample(queue[index]["eval_result"])
    del queue[index]
    print(f"✅ 样本 {index} 已批准")
|
| 452 |
+
|
| 453 |
+
def reject_sample(self, index: int) -> None:
    """Manually reject the queued sample at ``index`` and drop it from the queue.

    An out-of-range index is ignored.
    """
    queue = self.needs_review_queue
    if index < 0 or index >= len(queue):
        return
    print(f"❌ 样本 {index} 已拒绝")
    queue.pop(index)
|
| 458 |
+
|
| 459 |
+
|
| 460 |
+
# 全局实例
|
| 461 |
+
# Module-level service instance; None until init_auto_evaluation_service() assigns it.
auto_eval_service: Optional[AutoEvaluationService] = None
|
| 462 |
+
|
| 463 |
+
|
| 464 |
+
def init_auto_evaluation_service(
    eval_engine: EvaluationEngine,
    data_router: DataRoutingEngine,
    config: EvaluationConfig = None
) -> AutoEvaluationService:
    """Create the auto-evaluation service, store it as the module-level instance, and return it.

    Args:
        eval_engine: Evaluation engine handed to the service.
        data_router: Data routing engine handed to the service.
        config: Optional evaluation configuration, passed through unchanged.
    """
    global auto_eval_service
    service = AutoEvaluationService(
        eval_engine=eval_engine,
        data_router=data_router,
        config=config,
    )
    auto_eval_service = service
    return service
|
| 477 |
+
|
| 478 |
+
|
| 479 |
+
def get_auto_evaluation_service() -> Optional[AutoEvaluationService]:
    """Return the module-level auto-evaluation service, or None if it has not been initialised."""
    service = auto_eval_service
    return service
|
app/services/chat_service.py
ADDED
|
@@ -0,0 +1,601 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 文件路径: app/services/chat_service.py
|
| 2 |
+
import json
|
| 3 |
+
import asyncio
|
| 4 |
+
import re
|
| 5 |
+
import time
|
| 6 |
+
from dataclasses import dataclass, field
|
| 7 |
+
from typing import Dict, Optional, AsyncGenerator, List, Set
|
| 8 |
+
from app.core.config import settings
|
| 9 |
+
from app.utils.llm_client import client
|
| 10 |
+
from app.services.vector_service import store_manager
|
| 11 |
+
from app.services.github_service import get_file_content
|
| 12 |
+
from app.services.chunking_service import UniversalChunker, ChunkingConfig
|
| 13 |
+
from app.services.tracing_service import tracing_service
|
| 14 |
+
from app.utils.session import get_conversation_memory, ConversationMemory
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# ============================================================
|
| 18 |
+
# 配置类 - 解耦所有可调参数
|
| 19 |
+
# ============================================================
|
| 20 |
+
|
| 21 |
+
@dataclass
class ChatConfig:
    """Chat service settings: every tunable parameter is collected here."""

    # --- JIT (on-demand) file loading ---
    max_jit_rounds: int = 2            # maximum number of JIT rounds
    max_files_per_round: int = 3       # maximum files loaded in one round

    # --- LLM ---
    temperature_thinking: float = 0.1  # temperature while the model is still requesting files
    temperature_final: float = 0.2     # temperature for the final answer
    max_tokens: int = 4096             # maximum tokens per completion

    # --- Retrieval ---
    retrieval_top_k: int = 6           # top-k for RAG retrieval
    context_max_chars: int = 2000      # per-document character cap

    # --- Conversation context ---
    max_history_turns: int = 6         # keep the most recent N turns
    summary_threshold: int = 10        # start compressing beyond N turns

    # --- Debugging ---
    show_debug_info: bool = False      # whether to show debug information
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# 全局配置实例
|
| 46 |
+
# Shared configuration instance with default values; process_chat_stream reads it as `cfg`.
chat_config = ChatConfig()
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
@dataclass
class ChatResult:
    """Outcome of one chat request, kept for later automatic evaluation."""

    # Final answer text.
    answer: str
    # Context that was retrieved for the request.
    retrieved_context: str
    # Time spent generating, in milliseconds.
    generation_latency_ms: float
    # Time spent retrieving, in milliseconds.
    retrieval_latency_ms: float = 0
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
# === 评估数据存储 (供 main.py 获取) ===
|
| 59 |
+
# 存储每个 session 的评估数据,key 为 session_id
|
| 60 |
+
# Evaluation data per session, keyed by session_id; written by process_chat_stream.
_eval_data_store: Dict[str, ChatResult] = {}
|
| 61 |
+
|
| 62 |
+
def get_eval_data(session_id: str) -> Optional[ChatResult]:
    """Return the evaluation data stored for ``session_id``, or None if there is none."""
    if session_id in _eval_data_store:
        return _eval_data_store[session_id]
    return None
|
| 65 |
+
|
| 66 |
+
def clear_eval_data(session_id: str) -> None:
    """Drop the evaluation data stored for ``session_id``; a missing entry is ignored."""
    _eval_data_store.pop(session_id, None)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
# [Fix 2] 使用 Config 对象初始化,而非直接传参
|
| 73 |
+
# 之前的写法: chunker = UniversalChunker(min_chunk_size=100)
|
| 74 |
+
# 现在的写法:
|
| 75 |
+
# Shared chunker instance; _download_and_index uses it to split downloaded files.
chunker = UniversalChunker(config=ChunkingConfig(min_chunk_size=100))
|
| 76 |
+
|
| 77 |
+
# === 新增:简单的中文检测 ===
|
| 78 |
+
def is_chinese_query(text: str) -> bool:
    """Return True if ``text`` contains at least one character in U+4E00..U+9FFF."""
    return any('\u4e00' <= ch <= '\u9fff' for ch in text)
|
| 84 |
+
|
| 85 |
+
# === 优化 2:查询重写 (解决中英文检索不匹配问题) ===
|
| 86 |
+
async def _rewrite_query(user_query: str):
|
| 87 |
+
"""
|
| 88 |
+
使用 LLM 将用户的自然语言(可能是中文)转换为 3-5 个代码搜索关键词(英文)。
|
| 89 |
+
"""
|
| 90 |
+
prompt = f"""
|
| 91 |
+
You are a Code Search Expert.
|
| 92 |
+
Task: Convert the user's query into 3-5 English keywords for code search (BM25/Vector).
|
| 93 |
+
|
| 94 |
+
User Query: "{user_query}"
|
| 95 |
+
|
| 96 |
+
Rules:
|
| 97 |
+
1. Output ONLY a JSON list of strings.
|
| 98 |
+
2. Translate concepts to technical terms (e.g., "鉴权" -> "auth", "login", "middleware").
|
| 99 |
+
3. Keep it short.
|
| 100 |
+
|
| 101 |
+
Example Output: ["authentication", "login_handler", "jwt_verify"]
|
| 102 |
+
"""
|
| 103 |
+
try:
|
| 104 |
+
response = await client.chat.completions.create(
|
| 105 |
+
model=settings.default_model_name,
|
| 106 |
+
messages=[{"role": "user", "content": prompt}],
|
| 107 |
+
temperature=0.1,
|
| 108 |
+
max_tokens=100
|
| 109 |
+
)
|
| 110 |
+
content = response.choices[0].message.content
|
| 111 |
+
# 简单清洗
|
| 112 |
+
content = re.sub(r"^```(json)?|```$", "", content.strip(), flags=re.MULTILINE).strip()
|
| 113 |
+
keywords = json.loads(content)
|
| 114 |
+
if isinstance(keywords, list):
|
| 115 |
+
return " ".join(keywords) # 返回空格分隔的字符串供 BM25 使用
|
| 116 |
+
return user_query
|
| 117 |
+
except Exception as e:
|
| 118 |
+
print(f"⚠️ Query Rewrite Failed: {e}")
|
| 119 |
+
return user_query # 降级:直接用原句
|
| 120 |
+
|
| 121 |
+
async def process_chat_stream(user_query: str, session_id: str):
    """
    Stream a chat answer, with multi-round JIT file loading and conversation memory.

    Flow:
    1. Record the user message and, if needed, compress older history.
    2. Rewrite the query and run the initial hybrid retrieval.
    3. Let the LLM answer; it may request files with <tool_code> tags.
    4. Load requested files and continue (at most ``max_jit_rounds`` rounds).
    5. Save the full response to memory and to the evaluation data store.

    Args:
        user_query: The user's question.
        session_id: Session whose store, memory and evaluation slot are used.

    Yields:
        Text fragments for the client: status lines, LLM output as it streams,
        and an error line if something fails.
    """
    # Fail fast: without an LLM client nothing below can succeed, and the
    # conversation memory should not gain a user message that has no reply.
    if not client:
        yield "❌ LLM Error: Client not initialized"
        return

    vector_db = store_manager.get_store(session_id)
    cfg = chat_config  # shared configuration

    # === Conversation memory ===
    memory = get_conversation_memory(session_id)
    memory.add_user_message(user_query)  # record the user message right away

    # Compress older history when the memory asks for it.
    if memory.needs_summarization():
        yield "> 📝 *Compressing conversation history...*\n\n"
        await _compress_conversation_history(memory)

    # === Data collected for evaluation ===
    collected_context = ""
    collected_response = ""

    # === JIT state ===
    all_loaded_files: Set[str] = set()  # every file loaded so far
    all_failed_files: Set[str] = set()  # every file that failed to load
    jit_round = 0                       # current JIT round

    # === Language detection and matching UI strings ===
    use_chinese = is_chinese_query(user_query)
    ui_msgs = _get_ui_messages(use_chinese)

    # === Step 0: query rewrite ===
    search_query = await _rewrite_query(user_query)
    yield f"{ui_msgs['thinking']}`{search_query}`...\n\n"

    # === Step 1: initial RAG retrieval ===
    retrieval_start = time.time()
    relevant_docs = await vector_db.search_hybrid(search_query, top_k=cfg.retrieval_top_k)
    retrieval_latency_ms = (time.time() - retrieval_start) * 1000
    tracing_service.add_event("retrieval_completed", {
        "latency_ms": retrieval_latency_ms,
        "documents_retrieved": len(relevant_docs) if relevant_docs else 0
    })

    rag_context = _build_context(relevant_docs, cfg.context_max_chars)
    collected_context = rag_context

    # === Step 2: initial prompt ===
    global_context = vector_db.global_context or {}
    file_tree = global_context.get("file_tree", "(File tree not available.)")
    agent_summary = global_context.get("summary", "")

    conversation_context = _build_conversation_context(memory)

    system_instruction = _build_system_prompt(
        file_tree=file_tree,
        agent_summary=agent_summary,
        rag_context=rag_context,
        use_chinese=use_chinese,
        is_final_round=False,
        conversation_context=conversation_context
    )

    augmented_user_query = f"""
{user_query}

(System Note: Priority 1: Answer using context. Priority 2: Use <tool_code> ONLY if critical info is missing.)
"""

    messages = [
        {"role": "system", "content": system_instruction},
        {"role": "user", "content": augmented_user_query}
    ]

    try:
        generation_start = time.time()

        # === Multi-round JIT loop ===
        while jit_round <= cfg.max_jit_rounds:
            is_final_round = (jit_round == cfg.max_jit_rounds)

            # On the last round, swap in a system prompt that disables tools.
            # The conversation history is passed again so the final answer
            # does not lose the earlier turns.
            if is_final_round and jit_round > 0:
                messages[0] = {"role": "system", "content": _build_system_prompt(
                    file_tree=file_tree,
                    agent_summary=agent_summary,
                    rag_context=collected_context,
                    use_chinese=use_chinese,
                    is_final_round=True,
                    failed_files=sorted(all_failed_files),
                    conversation_context=conversation_context
                )}

            stream = await client.chat.completions.create(
                model=settings.default_model_name,
                messages=messages,
                stream=True,
                temperature=cfg.temperature_final if is_final_round else cfg.temperature_thinking,
                max_tokens=cfg.max_tokens
            )

            buffer = ""
            round_response = ""
            # A list (not a set) so files are loaded in the order requested.
            requested_files: List[str] = []

            async for chunk in stream:
                # Guard against chunks that carry no choices, which would
                # otherwise raise IndexError mid-stream.
                if not chunk.choices:
                    continue
                content = chunk.choices[0].delta.content or ""
                if not content:
                    continue

                buffer += content
                round_response += content
                collected_response += content

                # The buffer accumulates across chunks, so a tag split over
                # several chunks is still detected once it is complete.
                if "</tool_code>" in buffer:
                    skip = all_loaded_files | all_failed_files
                    for path in _extract_requested_files(buffer, skip):
                        if path not in requested_files:
                            requested_files.append(path)
                    buffer = ""

                yield content

            # === Continue with JIT? ===
            if not requested_files or is_final_round:
                # No new file request, or the round limit is reached.
                break

            # === JIT file loading ===
            jit_round += 1

            # Cap the number of files loaded per round.
            files_to_load = requested_files[:cfg.max_files_per_round]
            file_list_str = ", ".join([f"`{f}`" for f in files_to_load])

            yield f"\n\n> 🔍 **[JIT Round {jit_round}/{cfg.max_jit_rounds}]** {ui_msgs['action_short']}{file_list_str}...\n\n"

            if not vector_db.repo_url:
                yield ui_msgs['error_url']
                break

            round_loaded_docs = []
            round_loaded_files = []
            round_failed_files = []

            for file_path in files_to_load:
                if file_path in vector_db.indexed_files:
                    docs = vector_db.get_documents_by_file(file_path)
                    round_loaded_docs.extend(docs)
                    round_loaded_files.append(file_path)
                    all_loaded_files.add(file_path)
                    yield f"> ✅ Loaded: `{file_path}`\n"
                else:
                    success = await _download_and_index(vector_db, file_path)
                    if success:
                        docs = vector_db.get_documents_by_file(file_path)
                        round_loaded_docs.extend(docs)
                        round_loaded_files.append(file_path)
                        all_loaded_files.add(file_path)
                        yield f"> ✅ Downloaded: `{file_path}`\n"
                    else:
                        round_failed_files.append(file_path)
                        all_failed_files.add(file_path)
                        yield f"> ⚠️ Failed: `{file_path}`\n"

            # Build the follow-up message; the new context is rendered once
            # and reused for both the evaluation record and the LLM message.
            context_section = ""
            if round_loaded_docs:
                new_context = _build_context(round_loaded_docs, cfg.context_max_chars)
                collected_context += f"\n\n[JIT Round {jit_round} Context]\n{new_context}"
                context_section = f"\n\n[New Code Context]\n{new_context}"

            # The status text speaks of "files", so report the number of
            # files loaded this round rather than the number of chunks.
            status_msg = _build_jit_status_message(
                loaded_count=len(round_loaded_files),
                failed_files=round_failed_files,
                remaining_rounds=cfg.max_jit_rounds - jit_round,
                use_chinese=use_chinese
            )

            # Extend the message history and go round again.
            messages.append({"role": "assistant", "content": round_response})
            messages.append({"role": "user", "content": f"{status_msg}{context_section}\n\nPlease continue your analysis."})

            yield "\n\n"  # separator between rounds

        # === Generation finished ===
        generation_latency_ms = (time.time() - generation_start) * 1000

        tracing_service.add_event("generation_completed", {
            "latency_ms": generation_latency_ms,
            "jit_rounds": jit_round,
            "files_loaded": len(all_loaded_files),
            "files_failed": len(all_failed_files)
        })

        # === Save the assistant reply to conversation memory ===
        memory.add_assistant_message(collected_response)

        # Store the evaluation data for this session.
        _eval_data_store[session_id] = ChatResult(
            answer=collected_response,
            retrieved_context=collected_context,
            generation_latency_ms=generation_latency_ms,
            retrieval_latency_ms=retrieval_latency_ms
        )
        print(f"📦 [EvalData] Session {session_id}: {len(collected_context)} chars context, {len(collected_response)} chars answer, {jit_round} JIT rounds, {memory.get_turn_count()} turns")

    except Exception as e:
        import traceback
        traceback.print_exc()
        error_msg = str(e)
        # Keep whatever was generated before the failure.
        if collected_response:
            memory.add_assistant_message(collected_response + f"\n\n[Error: {error_msg}]")
        tracing_service.add_event("generation_error", {
            "error": error_msg,
            "error_type": type(e).__name__,
            "jit_round": jit_round
        })
        yield f"\n\n❌ System Error: {error_msg}"


def _extract_requested_files(text: str, skip) -> List[str]:
    """Return the file paths named in <tool_code> tags of ``text``, in order of appearance.

    Quotes and backticks are stripped from each path; empty paths, duplicates
    and paths contained in ``skip`` are left out.
    """
    found: List[str] = []
    for raw in re.findall(r"<tool_code>\s*(.*?)\s*</tool_code>", text, re.DOTALL):
        path = raw.strip().replace("'", "").replace('"', "").replace("`", "")
        if path and path not in skip and path not in found:
            found.append(path)
    return found
|
| 370 |
+
|
| 371 |
+
|
| 372 |
+
# ============================================================
|
| 373 |
+
# 辅助函数
|
| 374 |
+
# ============================================================
|
| 375 |
+
|
| 376 |
+
def _get_ui_messages(use_chinese: bool) -> Dict[str, str]:
|
| 377 |
+
"""获取 UI 消息(根据语言)"""
|
| 378 |
+
if use_chinese:
|
| 379 |
+
return {
|
| 380 |
+
"thinking": "> 🧠 **思考中:** 正在检索相关代码: ",
|
| 381 |
+
"action_short": "正在读取文件: ",
|
| 382 |
+
"error_url": "> ⚠️ 错误: 仓库链接丢失。\n",
|
| 383 |
+
}
|
| 384 |
+
else:
|
| 385 |
+
return {
|
| 386 |
+
"thinking": "> 🧠 **Thinking:** Searching for code related to: ",
|
| 387 |
+
"action_short": "Retrieving files: ",
|
| 388 |
+
"error_url": "> ⚠️ Error: Repository URL lost.\n",
|
| 389 |
+
}
|
| 390 |
+
|
| 391 |
+
|
| 392 |
+
def _build_system_prompt(
|
| 393 |
+
file_tree: str,
|
| 394 |
+
agent_summary: str,
|
| 395 |
+
rag_context: str,
|
| 396 |
+
use_chinese: bool,
|
| 397 |
+
is_final_round: bool,
|
| 398 |
+
failed_files: List[str] = None,
|
| 399 |
+
conversation_context: str = ""
|
| 400 |
+
) -> str:
|
| 401 |
+
"""构建系统提示词"""
|
| 402 |
+
lang_instruction = (
|
| 403 |
+
"IMPORTANT: The user is asking in Chinese. You MUST reply in Simplified Chinese (简体中文)."
|
| 404 |
+
if use_chinese else "Reply in English."
|
| 405 |
+
)
|
| 406 |
+
|
| 407 |
+
if is_final_round:
|
| 408 |
+
tool_instruction = """
|
| 409 |
+
[INSTRUCTIONS - FINAL ROUND]
|
| 410 |
+
This is your FINAL response. You MUST provide a complete answer NOW.
|
| 411 |
+
- DO NOT request any more files
|
| 412 |
+
- DO NOT use <tool_code> tags
|
| 413 |
+
- Synthesize all available context and give your best answer
|
| 414 |
+
- If some files were not accessible, explain what information is missing and provide the best possible answer with what you have
|
| 415 |
+
"""
|
| 416 |
+
if failed_files:
|
| 417 |
+
tool_instruction += f"\n Note: The following files could not be accessed: {', '.join(failed_files)}"
|
| 418 |
+
else:
|
| 419 |
+
tool_instruction = """
|
| 420 |
+
[INSTRUCTIONS]
|
| 421 |
+
1. **CHECK CONTEXT FIRST**: Look at the [Current Code Context]. Does it contain the answer?
|
| 422 |
+
2. **IF YES**: Answer directly. DO NOT use tools.
|
| 423 |
+
3. **IF NO**: Request missing files using tags: <tool_code>path/to/file</tool_code>
|
| 424 |
+
"""
|
| 425 |
+
|
| 426 |
+
# 添加对话历史上下文
|
| 427 |
+
conversation_section = ""
|
| 428 |
+
if conversation_context:
|
| 429 |
+
conversation_section = f"""
|
| 430 |
+
[Previous Conversation]
|
| 431 |
+
{conversation_context}
|
| 432 |
+
"""
|
| 433 |
+
|
| 434 |
+
return f"""
|
| 435 |
+
You are a Senior GitHub Repository Analyst.
|
| 436 |
+
{lang_instruction}
|
| 437 |
+
|
| 438 |
+
[Global Context - Repo Map]
|
| 439 |
+
{file_tree}
|
| 440 |
+
|
| 441 |
+
[Agent Analysis Summary]
|
| 442 |
+
{agent_summary}
|
| 443 |
+
{conversation_section}
|
| 444 |
+
[Current Code Context (Retrieved)]
|
| 445 |
+
{rag_context}
|
| 446 |
+
{tool_instruction}
|
| 447 |
+
"""
|
| 448 |
+
|
| 449 |
+
|
| 450 |
+
def _build_conversation_context(memory: ConversationMemory) -> str:
    """
    Render recent conversation history as text for the system prompt.

    Returns an empty string when the memory holds two or fewer context
    messages. Otherwise the last message is dropped, up to six of the
    remaining messages are rendered one per line as "User: ..." or
    "Assistant: ...", and each is cut to 500 characters with a trailing "...".
    """
    messages = memory.get_context_messages()

    # Two messages or fewer: treated as having no history worth showing.
    if len(messages) <= 2:
        return ""

    # Drop the last entry (the current user message).
    history = messages[:-1]
    if not history:
        return ""

    def _render(msg) -> str:
        speaker = "User" if msg["role"] == "user" else "Assistant"
        text = msg["content"]
        # Truncate long content and mark the cut.
        shown = text[:500] + ("..." if len(text) > 500 else "")
        return f"{speaker}: {shown}"

    # At most 6 messages (3 turns).
    return "\n".join(_render(msg) for msg in history[-6:])
|
| 478 |
+
|
| 479 |
+
|
| 480 |
+
async def _compress_conversation_history(memory: ConversationMemory) -> None:
    """
    Compress older conversation history into an LLM-written summary.

    Does nothing when the memory reports no messages to summarize. A failed
    LLM call is reported with a printed warning and leaves the memory as is.
    """
    pending = memory.get_messages_to_summarize()
    if not pending:
        return

    # One line per message, each capped at 300 characters.
    transcript_lines = []
    for m in pending:
        speaker = 'User' if m['role'] == 'user' else 'Assistant'
        transcript_lines.append(f"{speaker}: {m['content'][:300]}")
    conversation_text = "\n".join(transcript_lines)

    prompt = f"""Summarize the following conversation in 2-3 sentences, focusing on:
1. What questions were asked
2. Key information discovered
3. Important conclusions

Conversation:
{conversation_text}

Summary (be concise):"""

    try:
        response = await client.chat.completions.create(
            model=settings.default_model_name,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3,
            max_tokens=200
        )
        summary = response.choices[0].message.content.strip()

        # Store the summary together with the index up to which it applies:
        # everything except the most recent max_history_turns * 2 messages.
        keep_recent = chat_config.max_history_turns * 2
        end_idx = len(memory._messages) - keep_recent
        memory.set_summary(summary, end_idx)

        print(f"📝 Conversation compressed: {len(pending)} messages -> summary")
    except Exception as exc:
        print(f"⚠️ Failed to compress conversation: {exc}")
|
| 521 |
+
|
| 522 |
+
|
| 523 |
+
def _build_jit_status_message(
|
| 524 |
+
loaded_count: int,
|
| 525 |
+
failed_files: List[str],
|
| 526 |
+
remaining_rounds: int,
|
| 527 |
+
use_chinese: bool
|
| 528 |
+
) -> str:
|
| 529 |
+
"""构建 JIT 状态消息"""
|
| 530 |
+
if use_chinese:
|
| 531 |
+
if loaded_count > 0 and not failed_files:
|
| 532 |
+
return f"系统通知: 成功加载 {loaded_count} 个文件。"
|
| 533 |
+
elif loaded_count > 0 and failed_files:
|
| 534 |
+
failed_list = ", ".join(failed_files)
|
| 535 |
+
return f"系统通知: 加载了 {loaded_count} 个文件,但以下文件无法访问: {failed_list}。"
|
| 536 |
+
else:
|
| 537 |
+
failed_list = ", ".join(failed_files)
|
| 538 |
+
if remaining_rounds > 0:
|
| 539 |
+
return f"系统通知: 文件 ({failed_list}) 无法访问。你还有 {remaining_rounds} 次机会请求其他文件,或者基于现有上下文回答。"
|
| 540 |
+
else:
|
| 541 |
+
return f"系统通知: 文件 ({failed_list}) 无法访问。请基于现有上下文给出最佳回答。"
|
| 542 |
+
else:
|
| 543 |
+
if loaded_count > 0 and not failed_files:
|
| 544 |
+
return f"System Notification: Successfully loaded {loaded_count} files."
|
| 545 |
+
elif loaded_count > 0 and failed_files:
|
| 546 |
+
failed_list = ", ".join(failed_files)
|
| 547 |
+
return f"System Notification: Loaded {loaded_count} files, but the following could not be accessed: {failed_list}."
|
| 548 |
+
else:
|
| 549 |
+
failed_list = ", ".join(failed_files)
|
| 550 |
+
if remaining_rounds > 0:
|
| 551 |
+
return f"System Notification: Files ({failed_list}) could not be accessed. You have {remaining_rounds} more attempts to request other files, or answer based on available context."
|
| 552 |
+
else:
|
| 553 |
+
return f"System Notification: Files ({failed_list}) could not be accessed. Please provide the best possible answer based on existing context."
|
| 554 |
+
|
| 555 |
+
async def _download_and_index(vector_db, file_path):
    """Fetch one file from the repository behind ``vector_db`` and add it to the index.

    The file is downloaded via ``get_file_content``, split by ``chunker.chunk_file``
    in a worker thread, and the resulting chunks are handed to
    ``vector_db.add_documents``. If the chunker yields nothing, the whole file is
    stored as a single "text" chunk.

    Returns:
        True when the documents were added; False when the file had no content
        or any step raised (the error is printed, not propagated).
    """
    try:
        content = await get_file_content(vector_db.repo_url, file_path)
        if not content:
            return False

        chunks = await asyncio.to_thread(chunker.chunk_file, content, file_path)
        if not chunks:
            # Nothing came back from the chunker: index the raw file as one piece.
            whole_file = {
                "content": content,
                "metadata": {"file": file_path, "type": "text", "name": "root", "class": ""}
            }
            chunks = [whole_file]

        documents = [chunk["content"] for chunk in chunks]
        # Only these four metadata fields are forwarded to the vector store.
        metadatas = [
            {
                "file": chunk["metadata"]["file"],
                "type": chunk["metadata"]["type"],
                "name": chunk["metadata"].get("name", ""),
                "class": chunk["metadata"].get("class") or ""
            }
            for chunk in chunks
        ]
        await vector_db.add_documents(documents, metadatas)
    except Exception as e:
        print(f"Download Error: {e}")
        return False
    return True
|
| 583 |
+
|
| 584 |
+
|
| 585 |
+
def _build_context(docs: List[Dict], max_chars: int = 2000) -> str:
|
| 586 |
+
"""构建上下文字符串"""
|
| 587 |
+
if not docs:
|
| 588 |
+
return "(No relevant code snippets found yet)"
|
| 589 |
+
|
| 590 |
+
context = ""
|
| 591 |
+
for doc in docs:
|
| 592 |
+
file_info = doc.get('file', 'unknown')
|
| 593 |
+
metadata = doc.get('metadata', {})
|
| 594 |
+
|
| 595 |
+
if 'class' in metadata and metadata['class']:
|
| 596 |
+
file_info += f" (Class: {metadata['class']})"
|
| 597 |
+
|
| 598 |
+
content = doc.get('content', '')[:max_chars]
|
| 599 |
+
context += f"\n--- File: {file_info} ---\n{content}\n"
|
| 600 |
+
|
| 601 |
+
return context
|
app/services/chunking_service.py
ADDED
|
@@ -0,0 +1,372 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import ast
|
| 2 |
+
import re
|
| 3 |
+
import os
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
|
| 6 |
+
# --- 配置类 ---
|
| 7 |
+
@dataclass
class ChunkingConfig:
    """
    Central place for the tunable parameters of the chunking service.
    """
    min_chunk_size: int = 50       # minimum chunk size threshold (chars)
    max_chunk_size: int = 2000     # maximum chunk size threshold (chars)
    fallback_line_size: int = 100  # number of lines per chunk in the fallback strategy
    max_context_chars: int = 500   # maximum length of the context injected into each chunk;
                                   # longer context is not injected, so redundant text does not blow up the token count
|
| 17 |
+
|
| 18 |
+
class UniversalChunker:
    """
    Language-aware splitter that turns one source file into retrieval chunks.

    Dispatch (see ``chunk_file``):
      * ``.py``  -> AST-based splitting into classes / functions / methods, with the
        file's imports (and small globals) injected as a context header.
      * brace-delimited languages -> a token scan that extracts top-level ``{...}``
        blocks together with their signatures.
      * anything else -> fixed-size windows of lines.

    Every chunk is a dict ``{"content": str, "metadata": {...}}`` built by
    ``_create_chunk``.
    """

    # Extensions routed to the brace-based chunker.
    _C_STYLE_EXTENSIONS = frozenset({
        '.java', '.js', '.ts', '.jsx', '.tsx', '.go', '.cpp', '.c', '.h', '.cs', '.php', '.rs'
    })

    # Tokenizer for brace-delimited languages. Alternatives are ordered so that
    # comments and literals win over structural tokens found inside them.
    # The MACRO alternative deliberately uses [^\n] instead of '.', because the
    # pattern is compiled with DOTALL (needed for block comments) and '.' would
    # otherwise swallow everything up to the end of the file.
    _C_STYLE_TOKEN_RE = re.compile(
        r'(?P<BLOCK_COMMENT>/\*.*?\*/)|'                       # block comment
        r'(?P<LINE_COMMENT>//[^\n]*)|'                         # line comment
        r'(?P<STRING>"(?:\\.|[^"\\])*")|'                      # double-quoted string
        r"(?P<CHAR>'(?:\\.|[^'\\])*')|"                        # single-quoted char/string
        r'(?P<TEMPLATE>`(?:\\.|[^`\\])*`)|'                    # backtick template (JS/Go)
        r'(?P<MACRO>^[ \t]*#[^\n]*(?:\\\r?\n[^\n]*)*)|'        # '#' line, with backslash continuations
        r'(?P<BRACE_OPEN>\{)|'                                 # opening brace
        r'(?P<BRACE_CLOSE>\})|'                                # closing brace
        r'(?P<SEMICOLON>;)',                                   # statement terminator
        re.DOTALL | re.MULTILINE
    )
    _C_STYLE_TYPE_RE = re.compile(r'\b(class|struct|interface|enum|record|type)\s+([a-zA-Z0-9_]+)')
    _C_STYLE_CALLABLE_RE = re.compile(r'\b([a-zA-Z0-9_]+)\s*\(')
    _C_STYLE_NON_FUNCTION_WORDS = frozenset({'if', 'for', 'while', 'switch', 'catch', 'return', 'sizeof'})

    def __init__(self, config: "ChunkingConfig" = None):
        # Fall back to the default configuration when none is supplied.
        self.config = config if config else ChunkingConfig()

    def chunk_file(self, content: str, file_path: str):
        """Split ``content`` into chunks using the strategy selected by the file extension.

        Returns an empty list for empty content.
        """
        if not content:
            return []

        ext = os.path.splitext(file_path)[1].lower()

        if ext == '.py':
            return self._chunk_python(content, file_path)
        if ext in self._C_STYLE_EXTENSIONS:
            return self._chunk_c_style(content, file_path)
        return self._fallback_chunking(content, file_path)

    @staticmethod
    def _python_definition_source(content, node, indent_first_line=False):
        """Return ``(source, start_line)`` for a class/function node, decorators included.

        ``ast`` reports a definition as starting at its ``def``/``class`` keyword, so
        decorators have to be stitched back on explicitly. With ``indent_first_line``
        the first line of every piece is padded to the node's column, so a method
        keeps the indentation that its remaining lines already carry.
        ``source`` is None when the segment cannot be recovered.
        """
        body = ast.get_source_segment(content, node)
        if not body:
            return None, node.lineno

        indent = " " * node.col_offset if indent_first_line else ""
        start_line = node.lineno
        pieces = []
        for decorator in getattr(node, "decorator_list", []):
            text = ast.get_source_segment(content, decorator)
            if text:
                pieces.append(f"{indent}@{text}")
                start_line = min(start_line, decorator.lineno)
        pieces.append(indent + body)
        return "\n".join(pieces), start_line

    def _chunk_python(self, content, file_path):
        """
        AST-based chunking with tiered context injection.

        Top-level classes and functions become chunks (oversized classes are split
        per method). Imports are always prepended to those chunks; other top-level
        statements are prepended only while they fit in ``max_context_chars``,
        otherwise they are stored once as a separate "global_context" chunk.
        Files that do not parse, or that yield nothing, use the line-based fallback.
        """
        try:
            tree = ast.parse(content)
        except (SyntaxError, ValueError):
            # ValueError: e.g. source containing null bytes on some Python versions.
            return self._fallback_chunking(content, file_path)

        chunks = []
        import_nodes = []
        other_nodes = []
        function_class_chunks = []

        # A. Walk the top-level statements and classify them.
        for node in tree.body:
            if isinstance(node, ast.ClassDef):
                class_code, start_line = self._python_definition_source(content, node)
                if not class_code:
                    continue
                if len(class_code) <= self.config.max_chunk_size:
                    function_class_chunks.append(self._create_chunk(
                        class_code, file_path, "class", node.name, start_line, node.name
                    ))
                else:
                    # Oversized class: one chunk per method.
                    function_class_chunks.extend(
                        self._chunk_large_python_class(node, content, file_path)
                    )

            elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                func_code, start_line = self._python_definition_source(content, node)
                if func_code and len(func_code) >= self.config.min_chunk_size:
                    function_class_chunks.append(self._create_chunk(
                        func_code, file_path, "function", node.name, start_line
                    ))

            else:
                segment = ast.get_source_segment(content, node)
                if segment and segment.strip():
                    if isinstance(node, (ast.Import, ast.ImportFrom)):
                        import_nodes.append(segment)
                    else:
                        other_nodes.append(segment)

        # B. Decide whether the non-import globals are small enough to inject.
        has_core_code = len(function_class_chunks) > 0
        others_text = "\n".join(other_nodes).strip()
        should_inject_others = len(others_text) <= self.config.max_context_chars

        # C. Build the context header: imports always, globals only when small.
        context_parts = []
        if import_nodes:
            context_parts.append("\n".join(import_nodes))
        if others_text and should_inject_others:
            context_parts.append(others_text)

        full_header = "\n".join(context_parts).strip()
        if full_header:
            full_header = f"# --- Context ---\n{full_header}\n# ---------------\n"

        # D. Prepend the header to every class/function/method chunk.
        for chunk in function_class_chunks:
            chunk["content"] = full_header + chunk["content"]

        # E. Globals too large to inject are stored once, and only when there is
        #    core code for them to accompany.
        if has_core_code and others_text and not should_inject_others:
            chunks.append(self._create_chunk(
                others_text, file_path, "global_context", "globals", 1
            ))

        # F. Pure script (imports + top-level logic only).
        if not has_core_code:
            full_script = (("\n".join(import_nodes) + "\n") if import_nodes else "") + others_text
            if full_script.strip():
                # A long script is split by lines rather than kept as one big chunk
                # (1.5x tolerance over max_chunk_size).
                if len(full_script) > self.config.max_chunk_size * 1.5:
                    return self._fallback_chunking(content, file_path)
                chunks.append(self._create_chunk(
                    full_script, file_path, "script", "main", 1
                ))

        chunks.extend(function_class_chunks)

        if not chunks and len(content.strip()) > 0:
            return self._fallback_chunking(content, file_path)

        return chunks

    def _chunk_large_python_class(self, class_node, content, file_path):
        """Split an oversized class into one chunk per method.

        Each chunk starts with a synthetic header (class line, leading class-level
        assignments, docstring) followed by the method source. A class without any
        method is returned as a single whole-class chunk so its content is not lost.
        """
        chunks = []
        class_name = class_node.name
        docstring = ast.get_docstring(class_node) or ""

        # Collect class-level assignments that appear before the first method;
        # stop at the first method so unrelated later statements are not pulled in.
        class_vars = []
        for node in class_node.body:
            if isinstance(node, (ast.Assign, ast.AnnAssign)):
                seg = ast.get_source_segment(content, node)
                if seg:
                    class_vars.append(seg)
            elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                break

        vars_text = "\n    ".join(class_vars)
        if vars_text:
            vars_text = "\n    " + vars_text  # align with the class body indentation

        context_header = f"class {class_name}:{vars_text}\n    \"\"\"{docstring}\"\"\"\n    # ... (Parent Context)\n"

        for node in class_node.body:
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                method_code, start_line = self._python_definition_source(
                    content, node, indent_first_line=True
                )
                if not method_code:
                    continue

                full_chunk_content = context_header + "\n" + method_code
                chunks.append(self._create_chunk(
                    full_chunk_content, file_path, "method", node.name, start_line, class_name
                ))

        if not chunks:
            # No methods to split on: keep the class whole instead of dropping it.
            class_code, start_line = self._python_definition_source(content, class_node)
            if class_code:
                chunks.append(self._create_chunk(
                    class_code, file_path, "class", class_name, start_line, class_name
                ))
        return chunks

    @staticmethod
    def _c_style_context_priority(text):
        """Sort key for global-context parts: imports, then macros, then types, then the rest."""
        text = text.strip()
        if text.startswith(("#include", "import", "using")):
            return 0
        if text.startswith(("#define", "#macro")):
            return 1
        if text.startswith(("typedef", "enum", "struct")):
            return 2
        return 3

    def _chunk_c_style(self, content, file_path):
        """
        Brace-based chunking for C-like languages.

        The file is tokenized so that braces inside comments, strings and '#' lines
        are ignored. Each top-level ``{...}`` block becomes one chunk that starts at
        its signature. Text between blocks that ends in a top-level ';' is treated as
        global context; text after the last ';' is the next block's signature.
        Small global context is prepended to every chunk, large global context is
        stored as its own leading chunk.
        """
        chunks = []
        if not content:
            return []

        global_context_parts = []

        brace_balance = 0
        block_start_index = 0        # where the current block's signature starts
        pending_pre_text_start = 0   # end of the previous block (start of the gap text)
        last_semicolon_end = -1      # end offset of the latest ';' seen at brace depth 0

        for match in self._C_STYLE_TOKEN_RE.finditer(content):
            kind = match.lastgroup
            start, end = match.span()

            if kind == 'SEMICOLON':
                # Only real (tokenized) top-level semicolons may split a gap; a ';'
                # inside a comment or string never reaches this branch.
                if brace_balance == 0:
                    last_semicolon_end = end

            elif kind == 'BRACE_OPEN':
                if brace_balance == 0:
                    # A new top-level block starts here.
                    if last_semicolon_end > pending_pre_text_start:
                        # Up to the last ';' -> global context; the rest -> signature.
                        global_part = content[pending_pre_text_start:last_semicolon_end].strip()
                        if global_part:
                            global_context_parts.append(global_part)
                        block_start_index = last_semicolon_end
                    else:
                        # No ';' in the gap: the whole gap is taken as the signature
                        # (a trade-off for languages that do not require semicolons).
                        block_start_index = pending_pre_text_start

                    # Start the block at its first visible character so that both
                    # the chunk text and start_line point at the signature.
                    while block_start_index < start and content[block_start_index].isspace():
                        block_start_index += 1
                brace_balance += 1

            elif kind == 'BRACE_CLOSE':
                if brace_balance == 0:
                    # Stray '}' — ignore it instead of letting the depth go negative,
                    # which would hide every following block.
                    continue
                brace_balance -= 1

                if brace_balance == 0:
                    # Top-level block finished. Absorb a ';' that directly follows
                    # (e.g. `struct X { ... };`) so it does not become stray context.
                    block_end = end
                    probe = end
                    while probe < len(content) and content[probe] in ' \t':
                        probe += 1
                    if probe < len(content) and content[probe] == ';':
                        block_end = probe + 1

                    full_block_text = content[block_start_index:block_end]
                    meta = self._extract_c_style_metadata(full_block_text)
                    start_line = content.count('\n', 0, block_start_index) + 1

                    # Small blocks (getters/setters) are kept on purpose; the global
                    # header is added to all chunks in one pass further below.
                    chunks.append(self._create_chunk(
                        full_block_text, file_path, meta["type"], meta["name"], start_line
                    ))

                    pending_pre_text_start = block_end

            # Comments, string/char/template literals and '#' lines carry no structure.

        # Whatever follows the last complete block is global context as well.
        tail_text = content[pending_pre_text_start:].strip()
        if tail_text:
            global_context_parts.append(tail_text)

        # list.sort is stable, so parts of equal priority keep their file order.
        global_context_parts.sort(key=self._c_style_context_priority)

        full_global_context = "\n".join(global_context_parts).strip()
        should_inject = len(full_global_context) <= self.config.max_context_chars

        context_header = ""
        if full_global_context and should_inject:
            context_header = f"/* --- Global Context --- */\n{full_global_context}\n/* ---------------------- */\n"

        for chunk in chunks:
            chunk["content"] = context_header + chunk["content"]

        # Context that is too large to inject, or a file without any block, is
        # stored as a standalone leading chunk.
        if (full_global_context and not should_inject) or (not chunks and full_global_context):
            chunks.insert(0, self._create_chunk(
                full_global_context, file_path, "global_context", "header", 1
            ))

        if not chunks:
            return self._fallback_chunking(content, file_path)

        return chunks

    def _extract_c_style_metadata(self, code_block):
        """
        Derive ``{"type", "name"}`` from the signature part of a block (multi-line signatures supported).
        """
        # Everything before the first '{', collapsed to a single line for matching.
        header_part = code_block.split('{')[0]
        header_clean = " ".join(header_part.split())

        # 1. Type-like declarations (class/struct/interface/...).
        match = self._C_STYLE_TYPE_RE.search(header_clean)
        if match:
            return {"type": "class", "name": match.group(2)}

        # 2. Functions: first `word(` that is not a control-flow keyword.
        for match in self._C_STYLE_CALLABLE_RE.finditer(header_clean):
            name = match.group(1)
            if name not in self._C_STYLE_NON_FUNCTION_WORDS:
                return {"type": "function", "name": name}

        return {"type": "code_block", "name": "anonymous"}

    def _fallback_chunking(self, content, file_path):
        """Fallback strategy: windows of ``config.fallback_line_size`` lines each."""
        chunks = []
        lines = content.split('\n')
        chunk_size = self.config.fallback_line_size

        for i in range(0, len(lines), chunk_size):
            chunk_content = "\n".join(lines[i:i+chunk_size])
            chunks.append(self._create_chunk(chunk_content, file_path, "text_chunk", f"chunk_{i}", i+1))
        return chunks

    def _create_chunk(self, content, file_path, type_, name, start_line, class_name=""):
        """Build the chunk dict shared by all strategies."""
        return {
            "content": content,
            "metadata": {
                "file": file_path,
                "type": type_,
                "name": name,
                "start_line": start_line,
                "class": class_name
            }
        }
|
app/services/github_service.py
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
GitHub 服务层
|
| 4 |
+
|
| 5 |
+
职责:
|
| 6 |
+
- 提供业务级别的 GitHub 操作
|
| 7 |
+
- 封装底层客户端,提供简洁 API
|
| 8 |
+
- 保持向后兼容的函数签名
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import logging
|
| 12 |
+
from typing import List, Optional, Dict
|
| 13 |
+
|
| 14 |
+
from app.utils.github_client import (
|
| 15 |
+
GitHubClient,
|
| 16 |
+
GitHubRepo,
|
| 17 |
+
GitHubFile,
|
| 18 |
+
FileFilter,
|
| 19 |
+
GitHubError,
|
| 20 |
+
GitHubNotFoundError,
|
| 21 |
+
get_github_client,
|
| 22 |
+
parse_repo_url,
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
logger = logging.getLogger(__name__)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# ============================================================
|
| 29 |
+
# 服务类
|
| 30 |
+
# ============================================================
|
| 31 |
+
|
| 32 |
+
class GitHubService:
    """
    GitHub service.

    High-level, business-oriented operations that delegate to the asynchronous
    GitHub client.

    Example:
    ```python
    service = GitHubService()

    # List the files of a repository
    files = await service.get_repo_structure("https://github.com/owner/repo")

    # Fetch a single file
    content = await service.get_file_content(
        "https://github.com/owner/repo",
        "src/main.py"
    )

    # Fetch several files at once
    contents = await service.get_files_content(
        "https://github.com/owner/repo",
        ["README.md", "src/main.py", "requirements.txt"]
    )
    ```
    """

    def __init__(self, client: Optional[GitHubClient] = None):
        # The client may be injected; otherwise it is created on first use.
        self._client = client

    @property
    def client(self) -> GitHubClient:
        """The underlying client, obtained from ``get_github_client()`` on first access."""
        existing = self._client
        if existing is not None:
            return existing
        self._client = get_github_client()
        return self._client

    async def _get_repo_from_url(self, repo_url: str) -> GitHubRepo:
        """Resolve a repository URL to the repo object returned by the client.

        Raises:
            ValueError: if ``parse_repo_url`` cannot parse ``repo_url``.
        """
        parsed = parse_repo_url(repo_url)
        if parsed:
            owner, name = parsed
            return await self.client.get_repo(owner, name)
        raise ValueError(f"无效的 GitHub URL: {repo_url}")

    async def get_repo_structure(
        self,
        repo_url: str,
        file_filter: Optional[FileFilter] = None
    ) -> List[str]:
        """
        List the file paths of a repository.

        Args:
            repo_url: GitHub repository URL
            file_filter: optional custom file filter, passed through to the client

        Returns:
            List of file paths
        """
        repo = await self._get_repo_from_url(repo_url)
        tree = await self.client.get_repo_tree(repo, file_filter)
        paths = []
        for entry in tree:
            paths.append(entry.path)
        return paths

    async def get_file_content(
        self,
        repo_url: str,
        file_path: str
    ) -> Optional[str]:
        """
        Fetch the content of a single file.

        Args:
            repo_url: GitHub repository URL
            file_path: path of the file inside the repository

        Returns:
            Whatever the client returns for the file (annotated Optional[str])
        """
        repo = await self._get_repo_from_url(repo_url)
        content = await self.client.get_file_content(repo, file_path)
        return content

    async def get_files_content(
        self,
        repo_url: str,
        file_paths: List[str]
    ) -> Dict[str, Optional[str]]:
        """
        Fetch the content of several files in one call.

        Args:
            repo_url: GitHub repository URL
            file_paths: paths of the files to fetch

        Returns:
            The client's result, annotated as a {path: content} dict
        """
        repo = await self._get_repo_from_url(repo_url)
        contents = await self.client.get_files_content(repo, file_paths, show_progress=True)
        return contents

    async def get_repo_info(self, repo_url: str) -> GitHubRepo:
        """
        Fetch basic repository information.

        Args:
            repo_url: GitHub repository URL

        Returns:
            GitHubRepo object
        """
        repo = await self._get_repo_from_url(repo_url)
        return repo
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
# ============================================================
|
| 147 |
+
# 全局服务实例
|
| 148 |
+
# ============================================================
|
| 149 |
+
|
| 150 |
+
_github_service: Optional[GitHubService] = None
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def get_github_service() -> GitHubService:
    """Return the module-level GitHubService instance, creating it on the first call."""
    global _github_service
    if _github_service is not None:
        return _github_service
    _github_service = GitHubService()
    return _github_service
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
# ============================================================
|
| 162 |
+
# 兼容旧接口 (同步风格的函数签名,但返回协程)
|
| 163 |
+
# ============================================================
|
| 164 |
+
|
| 165 |
+
# 保留 parse_repo_url 的旧签名兼容
|
| 166 |
+
def parse_repo_url_compat(url: str) -> Optional[str]:
    """
    Parse a GitHub URL (legacy-compatible signature).

    Returns:
        The "owner/repo" string, or None when ``parse_repo_url`` yields nothing
    """
    parsed = parse_repo_url(url)
    if not parsed:
        return None
    owner, repo = parsed[0], parsed[1]
    return f"{owner}/{repo}"
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
async def get_repo_structure(repo_url: str) -> List[str]:
    """
    List the files of a repository (legacy-compatible wrapper).

    Note: this is a coroutine function and must be awaited.
    """
    return await get_github_service().get_repo_structure(repo_url)
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
async def get_file_content(repo_url: str, file_path: str) -> Optional[str]:
    """
    Fetch the content of a file (legacy-compatible wrapper).

    Note: this is a coroutine function and must be awaited.
    """
    return await get_github_service().get_file_content(repo_url, file_path)
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
# 导出
|
| 200 |
+
__all__ = [
|
| 201 |
+
"GitHubService",
|
| 202 |
+
"get_github_service",
|
| 203 |
+
"get_repo_structure",
|
| 204 |
+
"get_file_content",
|
| 205 |
+
"parse_repo_url_compat",
|
| 206 |
+
"GitHubError",
|
| 207 |
+
"GitHubNotFoundError",
|
| 208 |
+
"FileFilter",
|
| 209 |
+
"GitHubRepo",
|
| 210 |
+
]
|
app/services/tracing_service.py
ADDED
|
@@ -0,0 +1,549 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 文件路径: app/services/tracing_service.py
|
| 2 |
+
"""
|
| 3 |
+
Langfuse集成模块 - 用于端到端追踪和观测
|
| 4 |
+
|
| 5 |
+
核心能力:
|
| 6 |
+
1. 自动捕获每一步的延迟、Token成本、输入输出
|
| 7 |
+
2. 记录完整的调用链路: Query -> Rewrite -> Retrieval -> Generation
|
| 8 |
+
3. 记录Tool调用和参数
|
| 9 |
+
4. 集成到评估流程
|
| 10 |
+
|
| 11 |
+
Langfuse支持:
|
| 12 |
+
- 本地部署 (docker run ... langfuse)
|
| 13 |
+
- 云端托管 (app.langfuse.com)
|
| 14 |
+
|
| 15 |
+
Author: Dexter
|
| 16 |
+
Date: 2025-01-27
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
import time
|
| 20 |
+
import json
|
| 21 |
+
import os
|
| 22 |
+
from typing import Dict, Any, Optional, List, Callable
|
| 23 |
+
from functools import wraps
|
| 24 |
+
from datetime import datetime
|
| 25 |
+
from dataclasses import dataclass
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# ============================================================================
|
| 29 |
+
# 第一部分: Langfuse客户端初始化 (可选)
|
| 30 |
+
# ============================================================================
|
| 31 |
+
|
| 32 |
+
# Exception raised while importing langfuse, if any; stays None otherwise.
LANGFUSE_IMPORT_ERROR = None
# LANGFUSE_ENABLED (default "true") gates the import attempt;
# "0", "false", "no" and "off" (case-insensitive, surrounding whitespace ignored) turn it off.
_LANGFUSE_ENABLED_ENV = os.getenv("LANGFUSE_ENABLED", "true").strip().lower()
_LANGFUSE_ENABLED = _LANGFUSE_ENABLED_ENV not in {"0", "false", "no", "off"}

if _LANGFUSE_ENABLED:
    try:
        from langfuse import Langfuse
        from langfuse.decorators import observe, langfuse_context
        LANGFUSE_AVAILABLE = True
    except Exception as e:
        # Any failure during import (not only ImportError) is recorded and
        # marks langfuse as unavailable instead of aborting module import.
        LANGFUSE_IMPORT_ERROR = e
        LANGFUSE_AVAILABLE = False
else:
    # Disabled through the environment: the import is not attempted at all.
    LANGFUSE_AVAILABLE = False
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
@dataclass
class TracingConfig:
    """Tracing configuration."""
    enabled: bool = True
    backend: str = "langfuse"  # "langfuse" or "local"
    # The three Langfuse defaults below call os.getenv in the class body, so the
    # environment is read once, when this class is defined.
    langfuse_host: str = os.getenv("LANGFUSE_HOST", "http://localhost:3000")
    langfuse_public_key: str = os.getenv("LANGFUSE_PUBLIC_KEY", "")
    langfuse_secret_key: str = os.getenv("LANGFUSE_SECRET_KEY", "")
    capture_token_usage: bool = True
    capture_latency: bool = True
    local_log_dir: str = "logs/traces"  # directory for local trace logs
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
class TracingService:
    """Unified tracing facade with two backends: Langfuse and local JSONL files.

    When a Langfuse client could be created, every ``record_*`` call is sent
    to Langfuse *and* appended to a local ``<type>_<YYYYMMDD>.jsonl`` file in
    ``config.local_log_dir``. Without a client only the local file is written.

    All records produced after ``start_trace`` are attached to that trace by
    passing ``current_trace_id`` to Langfuse. Tracing is best effort: backend
    and file-system failures are printed as warnings and never propagate to
    the caller. With ``config.enabled`` set to ``False`` nothing is recorded.
    """

    def __init__(self, config: Optional["TracingConfig"] = None):
        """Create the service and, if requested and possible, a Langfuse client.

        Args:
            config: Tracing settings; a default ``TracingConfig`` is created
                when omitted. ``config.backend`` is rewritten to ``"local"``
                when Langfuse is unavailable or fails to initialise.
        """
        self.config = config or TracingConfig()
        self.langfuse_client = None
        self.current_trace_id = None

        if self.config.enabled and self.config.backend == "langfuse":
            if not LANGFUSE_AVAILABLE:
                print("⚠️ Langfuse not installed. Install with: pip install langfuse. Falling back to local logging.")
                self.config.backend = "local"
            else:
                try:
                    self.langfuse_client = Langfuse(
                        host=self.config.langfuse_host,
                        public_key=self.config.langfuse_public_key,
                        secret_key=self.config.langfuse_secret_key,
                        enabled=True,
                        debug=False
                    )
                    print("✅ Langfuse client initialized successfully")
                except Exception as e:
                    print(f"⚠️ Langfuse initialization failed: {e}. Falling back to local logging.")
                    self.config.backend = "local"

        # The local log directory is needed by both backends.
        os.makedirs(self.config.local_log_dir, exist_ok=True)

    def start_trace(self, trace_name: str, session_id: str, metadata: Dict = None) -> str:
        """Start a new trace and make it the current one.

        Args:
            trace_name: Display name of the trace.
            session_id: Session the trace belongs to.
            metadata: Optional dict, sent to Langfuse as the trace input.

        Returns:
            The generated trace id (a UUID4 string).
        """
        import uuid
        trace_id = str(uuid.uuid4())
        self.current_trace_id = trace_id

        if self.langfuse_client:
            try:
                # Pass our own id so the value returned to the caller (and
                # used by get_trace_url) identifies the Langfuse trace.
                self.langfuse_client.trace(
                    id=trace_id,
                    name=trace_name,
                    input=metadata or {},
                    session_id=session_id
                )
                print(f"📍 Trace started: {trace_id}")
            except Exception as e:
                print(f"⚠️ Failed to start Langfuse trace: {e}")
        else:
            self._log_locally("trace_start", {
                "trace_id": trace_id,
                "name": trace_name,
                "session_id": session_id,
                "metadata": metadata,
                "timestamp": datetime.now().isoformat()
            })

        return trace_id

    def record_span(
        self,
        span_name: str,
        operation: str,
        input_data: Any,
        output_data: Any,
        latency_ms: float,
        token_usage: Dict[str, int] = None,
        metadata: Dict = None
    ) -> None:
        """Record one operation span.

        ``input_data`` and ``output_data`` are sent to Langfuse only; the
        local record keeps the name, operation, latency, token usage and
        metadata.
        """
        span_record = {
            "span_name": span_name,
            "operation": operation,
            "latency_ms": latency_ms,
            "timestamp": datetime.now().isoformat(),
            "token_usage": token_usage or {},
            "metadata": metadata or {}
        }

        if self.langfuse_client:
            try:
                self.langfuse_client.span(
                    trace_id=self.current_trace_id,
                    name=span_name,
                    input=input_data,
                    output=output_data,
                    metadata={
                        "operation": operation,
                        "latency_ms": latency_ms,
                        **(token_usage or {}),
                        **(metadata or {})
                    }
                )
            except Exception as e:
                print(f"⚠️ Failed to record span to Langfuse: {e}")

        self._log_locally("span", span_record)

    def record_tool_call(
        self,
        tool_name: str,
        parameters: Dict,
        result: Any,
        latency_ms: float,
        success: bool,
        error: str = None
    ) -> None:
        """Record a tool invocation.

        The local record stores ``str(result)`` truncated to 500 characters;
        only a ``None`` result is stored as ``None`` (falsy results such as
        ``0`` or ``""`` are preserved).
        """
        tool_record = {
            "tool_name": tool_name,
            "parameters": parameters,
            "result": str(result)[:500] if result is not None else None,
            "latency_ms": latency_ms,
            "success": success,
            "error": error,
            "timestamp": datetime.now().isoformat()
        }

        if self.langfuse_client:
            try:
                self.langfuse_client.event(
                    trace_id=self.current_trace_id,
                    name=f"tool_call:{tool_name}",
                    input=parameters,
                    output=result,
                    metadata={
                        "latency_ms": latency_ms,
                        "success": success,
                        "error": error
                    }
                )
            except Exception as e:
                print(f"⚠️ Failed to record tool call: {e}")

        self._log_locally("tool_call", tool_record)

    def record_retrieval_debug(
        self,
        query: str,
        retrieved_files: List[str],
        vector_scores: List[float],
        bm25_scores: List[float],
        latency_ms: float
    ) -> None:
        """Record debugging information about one retrieval step."""
        retrieval_record = {
            "query": query,
            "retrieved_count": len(retrieved_files),
            "files": retrieved_files,
            "vector_scores": vector_scores,
            "bm25_scores": bm25_scores,
            "latency_ms": latency_ms,
            "timestamp": datetime.now().isoformat()
        }

        if self.langfuse_client:
            try:
                self.langfuse_client.event(
                    trace_id=self.current_trace_id,
                    name="retrieval_debug",
                    input={"query": query},
                    output={"files": retrieved_files},
                    metadata=retrieval_record
                )
            except Exception as e:
                print(f"⚠️ Failed to record retrieval debug: {e}")

        self._log_locally("retrieval", retrieval_record)

    def record_llm_generation(
        self,
        model: str,
        prompt_messages: List[Dict],
        generated_text: str,
        ttft_ms: float = None,
        total_latency_ms: float = None,
        prompt_tokens: int = None,
        completion_tokens: int = None,
        total_tokens: int = None,
        is_streaming: bool = False,
        metadata: Dict = None
    ) -> None:
        """Record one LLM generation, including token usage and latency.

        Args:
            model: Model name (e.g. "gpt-4", "claude-3").
            prompt_messages: Messages sent to the LLM.
            generated_text: Generated text; truncated to 500 characters in
                the local log and 1000 characters for Langfuse.
            ttft_ms: Time to first token, in milliseconds.
            total_latency_ms: Total generation latency, in milliseconds.
            prompt_tokens: Number of input tokens.
            completion_tokens: Number of output tokens.
            total_tokens: Total number of tokens.
            is_streaming: Whether the output was streamed.
            metadata: Extra metadata.
        """
        # Throughput is only defined when both inputs are present and positive.
        if completion_tokens and total_latency_ms and total_latency_ms > 0:
            tokens_per_second = round(completion_tokens / (total_latency_ms / 1000), 2)
        else:
            tokens_per_second = None

        llm_record = {
            "model": model,
            "is_streaming": is_streaming,
            "prompt_preview": str(prompt_messages)[:500],  # truncated to keep logs small
            "generated_preview": generated_text[:500] if generated_text else "",
            "generated_length": len(generated_text) if generated_text else 0,
            "token_usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": total_tokens
            },
            "latency": {
                "ttft_ms": ttft_ms,
                "total_ms": total_latency_ms,
                "tokens_per_second": tokens_per_second
            },
            "timestamp": datetime.now().isoformat(),
            "metadata": metadata or {}
        }

        if self.langfuse_client:
            try:
                self.langfuse_client.generation(
                    trace_id=self.current_trace_id,
                    name="llm_generation",
                    model=model,
                    input=prompt_messages,
                    output=generated_text[:1000] if generated_text else "",
                    usage={
                        "prompt_tokens": prompt_tokens or 0,
                        "completion_tokens": completion_tokens or 0,
                        "total_tokens": total_tokens or 0
                    },
                    metadata={
                        "ttft_ms": ttft_ms,
                        "total_latency_ms": total_latency_ms,
                        "is_streaming": is_streaming,
                        **(metadata or {})
                    }
                )
            except Exception as e:
                print(f"⚠️ Failed to record LLM generation to Langfuse: {e}")

        self._log_locally("llm_generation", llm_record)

    def record_ttft(self, ttft_ms: float, model: str = None, metadata: Dict = None) -> None:
        """Record time-to-first-token on its own.

        Intended for streaming generation, so the value can be recorded as
        soon as the first token arrives.

        Args:
            ttft_ms: Time to first token, in milliseconds.
            model: Model name.
            metadata: Extra metadata.
        """
        ttft_record = {
            "ttft_ms": ttft_ms,
            "model": model,
            "timestamp": datetime.now().isoformat(),
            "metadata": metadata or {}
        }

        if self.langfuse_client:
            try:
                self.langfuse_client.event(
                    trace_id=self.current_trace_id,
                    name="ttft",
                    input={},
                    output={"ttft_ms": ttft_ms},
                    metadata=ttft_record
                )
            except Exception as e:
                print(f"⚠️ Failed to record TTFT: {e}")

        self._log_locally("ttft", ttft_record)

    def add_event(self, event_name: str, event_data: Dict[str, Any] = None) -> None:
        """Record a named event.

        Args:
            event_name: Event name (e.g. "repo_map_generated",
                "file_read_failed").
            event_data: Data associated with the event.
        """
        event_record = {
            "event_name": event_name,
            "event_data": event_data or {},
            "timestamp": datetime.now().isoformat()
        }

        if self.langfuse_client:
            try:
                self.langfuse_client.event(
                    trace_id=self.current_trace_id,
                    name=event_name,
                    input={},
                    output=event_data or {},
                    metadata=event_data or {}
                )
            except Exception as e:
                print(f"⚠️ Failed to record event '{event_name}': {e}")

        self._log_locally("event", event_record)

    def _log_locally(self, log_type: str, data: Dict) -> None:
        """Append ``data`` as one JSON line to today's ``<log_type>`` log file.

        Does nothing when tracing is disabled. Serialisation and I/O errors
        are reported as a warning instead of being raised, so a logging
        problem can never fail the operation being traced.
        """
        if not self.config.enabled:
            return

        log_file = os.path.join(
            self.config.local_log_dir,
            f"{log_type}_{datetime.now().strftime('%Y%m%d')}.jsonl"
        )

        try:
            # Serialise first so a failure never leaves a partial line behind.
            line = json.dumps(data, ensure_ascii=False, default=str)
            with open(log_file, 'a', encoding='utf-8') as f:
                f.write(line + '\n')
        except Exception as e:
            print(f"⚠️ Failed to write local trace log '{log_file}': {e}")

    def get_trace_url(self, trace_id: str = None) -> Optional[str]:
        """Return the Langfuse URL of a trace (for front-end links).

        Args:
            trace_id: Trace to link to; defaults to the current trace.

        Returns:
            The URL, or ``None`` when there is no Langfuse client or no
            trace id is known.
        """
        trace_id = trace_id or self.current_trace_id
        if not self.langfuse_client or not trace_id:
            return None

        # NOTE(review): the "/traces/<id>" path is kept from the original
        # code; confirm it matches the URL scheme of the deployed Langfuse.
        return f"{self.config.langfuse_host.rstrip('/')}/traces/{trace_id}"
|
| 378 |
+
|
| 379 |
+
|
| 380 |
+
# ============================================================================
|
| 381 |
+
# 第二部分: 装饰器 - 自动追踪
|
| 382 |
+
# ============================================================================
|
| 383 |
+
|
| 384 |
+
def traced(operation_name: str, capture_args: List[str] = None):
    """Decorator factory that records a span for every call of a function.

    Works for both ``def`` and ``async def`` functions. A span is recorded
    through the module-level ``tracing_service`` after the call returns
    (output ``{"success": True}``) or raises (output ``{"error": str(e)}``,
    metadata ``{"error": True}``); the original exception is re-raised
    unchanged. Failures of the tracing backend itself are printed and never
    affect the wrapped call.

    Args:
        operation_name: Span name.
        capture_args: Names of arguments to record as span input. Arguments
            are matched by name whether they were passed positionally or by
            keyword.

    Example:
        @traced("query_rewrite", capture_args=["user_query"])
        async def rewrite_query(user_query: str):
            ...
    """
    # Imported locally: ``inspect`` is not part of the module's import block.
    import inspect

    def decorator(func: Callable):
        # Resolve the signature once; some callables do not expose one.
        try:
            signature = inspect.signature(func)
        except (TypeError, ValueError):
            signature = None

        def _capture_inputs(args, kwargs):
            """Pick the requested arguments out of this call's args/kwargs."""
            if not capture_args:
                return {}
            available = {}
            if signature is not None:
                try:
                    available.update(signature.bind_partial(*args, **kwargs).arguments)
                except TypeError:
                    # Invalid call; func itself will raise the real error.
                    pass
            # Keep the old behaviour for keywords swallowed by **kwargs.
            for key, value in kwargs.items():
                available.setdefault(key, value)
            return {name: available[name] for name in capture_args if name in available}

        def _emit_span(input_data, output_data, started_at, metadata=None):
            """Record the span; tracing errors must not reach the caller."""
            latency_ms = (time.perf_counter() - started_at) * 1000
            try:
                tracing_service.record_span(
                    span_name=operation_name,
                    operation=func.__name__,
                    input_data=input_data,
                    output_data=output_data,
                    latency_ms=latency_ms,
                    metadata=metadata
                )
            except Exception as trace_err:
                print(f"⚠️ Failed to record span '{operation_name}': {trace_err}")

        @wraps(func)
        async def async_wrapper(*args, **kwargs):
            started_at = time.perf_counter()
            input_data = _capture_inputs(args, kwargs)
            try:
                result = await func(*args, **kwargs)
            except Exception as e:
                _emit_span(input_data, {"error": str(e)}, started_at, {"error": True})
                raise
            _emit_span(input_data, {"success": True}, started_at)
            return result

        @wraps(func)
        def sync_wrapper(*args, **kwargs):
            started_at = time.perf_counter()
            input_data = _capture_inputs(args, kwargs)
            try:
                result = func(*args, **kwargs)
            except Exception as e:
                _emit_span(input_data, {"error": str(e)}, started_at, {"error": True})
                raise
            _emit_span(input_data, {"success": True}, started_at)
            return result

        # Pick the wrapper that matches the kind of function being decorated.
        if inspect.iscoroutinefunction(func):
            return async_wrapper
        return sync_wrapper

    return decorator
|
| 474 |
+
|
| 475 |
+
|
| 476 |
+
# ============================================================================
|
| 477 |
+
# 第三部分: 全局实例
|
| 478 |
+
# ============================================================================
|
| 479 |
+
|
| 480 |
+
# Process-wide default configuration: start from the local backend and
# switch to Langfuse only when its import succeeded.
tracing_config = TracingConfig(enabled=True, backend="local")
if LANGFUSE_AVAILABLE:
    tracing_config.backend = "langfuse"

# Shared service instance imported by the rest of the application.
tracing_service = TracingService(config=tracing_config)
|
| 486 |
+
|
| 487 |
+
|
| 488 |
+
# ============================================================================
|
| 489 |
+
# 第四部分: 集成示例 (如何在agent_service.py中使用)
|
| 490 |
+
# ============================================================================
|
| 491 |
+
|
| 492 |
+
"""
|
| 493 |
+
在你的agent_service.py中添加:
|
| 494 |
+
|
| 495 |
+
1. 导入追踪服务:
|
| 496 |
+
from app.services.tracing_service import tracing_service
|
| 497 |
+
|
| 498 |
+
2. 在agent_stream函数开始:
|
| 499 |
+
trace_id = tracing_service.start_trace(
|
| 500 |
+
trace_name="github_agent_analysis",
|
| 501 |
+
session_id=session_id,
|
| 502 |
+
metadata={"repo_url": repo_url, "language": language}
|
| 503 |
+
)
|
| 504 |
+
|
| 505 |
+
3. 在generate_repo_map函数周围:
|
| 506 |
+
start_time = time.time()
|
| 507 |
+
file_tree_str, mapped_files = await generate_repo_map(repo_url, file_list, limit=limit)
|
| 508 |
+
latency_ms = (time.time() - start_time) * 1000
|
| 509 |
+
|
| 510 |
+
tracing_service.record_span(
|
| 511 |
+
span_name="generate_repo_map",
|
| 512 |
+
operation="repo_mapping",
|
| 513 |
+
input_data={"file_count": len(file_list), "limit": limit},
|
| 514 |
+
output_data={"files_in_map": len(mapped_files)},
|
| 515 |
+
latency_ms=latency_ms
|
| 516 |
+
)
|
| 517 |
+
|
| 518 |
+
4. 在process_single_file中记录检索:
|
| 519 |
+
tracing_service.record_retrieval_debug(
|
| 520 |
+
query=search_query,
|
| 521 |
+
retrieved_files=valid_files,
|
| 522 |
+
vector_scores=vector_scores,
|
| 523 |
+
bm25_scores=bm25_scores,
|
| 524 |
+
latency_ms=search_latency
|
| 525 |
+
)
|
| 526 |
+
|
| 527 |
+
5. 工具调用记录:
|
| 528 |
+
start_time = time.time()
|
| 529 |
+
try:
|
| 530 |
+
result = get_file_content(repo_url, file_path)
|
| 531 |
+
tracing_service.record_tool_call(
|
| 532 |
+
tool_name="get_file_content",
|
| 533 |
+
parameters={"file_path": file_path},
|
| 534 |
+
result=result[:100] if result else None,
|
| 535 |
+
latency_ms=(time.time() - start_time) * 1000,
|
| 536 |
+
success=True
|
| 537 |
+
)
|
| 538 |
+
except Exception as e:
|
| 539 |
+
tracing_service.record_tool_call(
|
| 540 |
+
tool_name="get_file_content",
|
| 541 |
+
parameters={"file_path": file_path},
|
| 542 |
+
result=None,
|
| 543 |
+
latency_ms=(time.time() - start_time) * 1000,
|
| 544 |
+
success=False,
|
| 545 |
+
error=str(e)
|
| 546 |
+
)
|
| 547 |
+
"""
|
| 548 |
+
|
| 549 |
+
import asyncio
|
app/services/vector_service.py
ADDED
|
@@ -0,0 +1,676 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
向量服务层 - Qdrant 版
|
| 4 |
+
|
| 5 |
+
特性:
|
| 6 |
+
1. 混合搜索 - Qdrant 向量 + BM25 关键词,RRF 融合
|
| 7 |
+
2. 异步原生 - 全链路异步
|
| 8 |
+
3. 会话隔离 - 每个 session 独立集合
|
| 9 |
+
4. 状态持久化 - 仓库信息、BM25 索引缓存
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import asyncio
|
| 13 |
+
import json
|
| 14 |
+
import logging
|
| 15 |
+
import os
|
| 16 |
+
import pickle
|
| 17 |
+
import re
|
| 18 |
+
import tempfile
|
| 19 |
+
import time
|
| 20 |
+
from dataclasses import dataclass, field
|
| 21 |
+
from typing import List, Dict, Any, Optional, Set
|
| 22 |
+
|
| 23 |
+
from rank_bm25 import BM25Okapi
|
| 24 |
+
|
| 25 |
+
from app.core.config import settings
|
| 26 |
+
from app.storage.base import Document, SearchResult, CollectionStats
|
| 27 |
+
from app.storage.qdrant_store import QdrantVectorStore, QdrantConfig, get_qdrant_factory
|
| 28 |
+
from app.utils.embedding import get_embedding_service, EmbeddingConfig
|
| 29 |
+
|
| 30 |
+
logger = logging.getLogger(__name__)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
# ============================================================
|
| 34 |
+
# 使用统一配置
|
| 35 |
+
# ============================================================
|
| 36 |
+
|
| 37 |
+
from app.core.config import vector_config as config
|
| 38 |
+
|
| 39 |
+
# 确保目录存在
|
| 40 |
+
os.makedirs(config.context_dir, exist_ok=True)
|
| 41 |
+
|
| 42 |
+
# === 向后兼容导出 (供 main.py 使用) ===
|
| 43 |
+
vector_config = config # 兼容旧名称
|
| 44 |
+
CONTEXT_DIR = config.context_dir
|
| 45 |
+
QDRANT_DIR = config.data_dir # Qdrant 数据目录
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
# ============================================================
|
| 49 |
+
# Embedding 服务
|
| 50 |
+
# ============================================================
|
| 51 |
+
|
| 52 |
+
_embedding_service = None
|
| 53 |
+
|
| 54 |
+
def get_embedding():
    """Return the shared embedding service, creating it on first use.

    The service is built from the ``embedding_*`` values of the module-level
    ``config`` and cached in ``_embedding_service``.
    """
    global _embedding_service
    if _embedding_service is not None:
        return _embedding_service

    _embedding_service = get_embedding_service(
        EmbeddingConfig(
            api_base_url=config.embedding_api_url,
            model_name=config.embedding_model,
            batch_size=config.embedding_batch_size,
            max_text_length=config.embedding_max_length,
            max_concurrent_batches=config.embedding_concurrency,
        )
    )
    return _embedding_service
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
# ============================================================
|
| 70 |
+
# 向量存储服务
|
| 71 |
+
# ============================================================
|
| 72 |
+
|
| 73 |
+
class VectorStore:
|
| 74 |
+
"""
|
| 75 |
+
向量存储服务
|
| 76 |
+
|
| 77 |
+
整合 Qdrant 向量搜索和 BM25 关键词搜索
|
| 78 |
+
|
| 79 |
+
使用示例:
|
| 80 |
+
```python
|
| 81 |
+
store = VectorStore("session_123")
|
| 82 |
+
await store.initialize()
|
| 83 |
+
|
| 84 |
+
# 重置 (分析新仓库时)
|
| 85 |
+
await store.reset()
|
| 86 |
+
|
| 87 |
+
# 添加文档
|
| 88 |
+
await store.add_documents(documents, metadatas)
|
| 89 |
+
|
| 90 |
+
# 混合搜索
|
| 91 |
+
results = await store.search_hybrid("how does auth work?")
|
| 92 |
+
|
| 93 |
+
await store.close()
|
| 94 |
+
```
|
| 95 |
+
"""
|
| 96 |
+
|
| 97 |
+
def __init__(self, session_id: str):
|
| 98 |
+
self.session_id = self._sanitize_id(session_id)
|
| 99 |
+
self.collection_name = f"repo_{self.session_id}"
|
| 100 |
+
|
| 101 |
+
# Qdrant 存储
|
| 102 |
+
self._qdrant: Optional[QdrantVectorStore] = None
|
| 103 |
+
|
| 104 |
+
# BM25 索引 (内存)
|
| 105 |
+
self._bm25: Optional[BM25Okapi] = None
|
| 106 |
+
self._doc_store: List[Document] = []
|
| 107 |
+
self._indexed_files: Set[str] = set()
|
| 108 |
+
|
| 109 |
+
# 上下文
|
| 110 |
+
self.repo_url: Optional[str] = None
|
| 111 |
+
self.global_context: Dict[str, Any] = {}
|
| 112 |
+
|
| 113 |
+
# 文件路径
|
| 114 |
+
self._context_file = os.path.join(config.context_dir, f"{self.session_id}.json")
|
| 115 |
+
self._cache_file = os.path.join(config.context_dir, f"{self.session_id}_bm25.pkl")
|
| 116 |
+
|
| 117 |
+
self._initialized = False
|
| 118 |
+
|
| 119 |
+
@staticmethod
|
| 120 |
+
def _sanitize_id(session_id: str) -> str:
|
| 121 |
+
"""清理 session ID"""
|
| 122 |
+
clean = re.sub(r'[^a-zA-Z0-9_-]', '', session_id)
|
| 123 |
+
if not clean:
|
| 124 |
+
raise ValueError("Invalid session_id")
|
| 125 |
+
return clean
|
| 126 |
+
|
| 127 |
+
async def initialize(self) -> None:
|
| 128 |
+
"""初始化存储"""
|
| 129 |
+
if self._initialized:
|
| 130 |
+
return
|
| 131 |
+
|
| 132 |
+
# 初始化 Qdrant
|
| 133 |
+
factory = get_qdrant_factory()
|
| 134 |
+
self._qdrant = factory.create(self.collection_name)
|
| 135 |
+
await self._qdrant.initialize()
|
| 136 |
+
|
| 137 |
+
# 加载本地状态
|
| 138 |
+
await self._load_state()
|
| 139 |
+
|
| 140 |
+
self._initialized = True
|
| 141 |
+
logger.debug(f"✅ VectorStore 初始化: {self.session_id}")
|
| 142 |
+
|
| 143 |
+
async def close(self) -> None:
|
| 144 |
+
"""关闭连接"""
|
| 145 |
+
if self._qdrant:
|
| 146 |
+
await self._qdrant.close()
|
| 147 |
+
self._qdrant = None
|
| 148 |
+
self._initialized = False
|
| 149 |
+
|
| 150 |
+
async def _load_state(self) -> None:
|
| 151 |
+
"""加载状态"""
|
| 152 |
+
# 1. 加载上下文 JSON
|
| 153 |
+
if os.path.exists(self._context_file):
|
| 154 |
+
try:
|
| 155 |
+
with open(self._context_file, 'r', encoding='utf-8') as f:
|
| 156 |
+
data = json.load(f)
|
| 157 |
+
self.repo_url = data.get("repo_url")
|
| 158 |
+
self.global_context = data.get("global_context", {})
|
| 159 |
+
except Exception as e:
|
| 160 |
+
logger.warning(f"加载上下文失败: {e}")
|
| 161 |
+
|
| 162 |
+
# 2. 尝试加载 BM25 缓存
|
| 163 |
+
cache_loaded = False
|
| 164 |
+
if os.path.exists(self._cache_file):
|
| 165 |
+
try:
|
| 166 |
+
with open(self._cache_file, 'rb') as f:
|
| 167 |
+
cache = pickle.load(f)
|
| 168 |
+
if isinstance(cache, dict) and cache.get("version") == config.cache_version:
|
| 169 |
+
self._bm25 = cache.get("bm25")
|
| 170 |
+
self._doc_store = cache.get("doc_store", [])
|
| 171 |
+
self._indexed_files = cache.get("indexed_files", set())
|
| 172 |
+
cache_loaded = True
|
| 173 |
+
logger.debug(f"📦 BM25 缓存命中: {len(self._doc_store)} 文档")
|
| 174 |
+
except Exception as e:
|
| 175 |
+
logger.warning(f"BM25 缓存损坏: {e}")
|
| 176 |
+
os.remove(self._cache_file)
|
| 177 |
+
|
| 178 |
+
# 3. 缓存未命中: 从 Qdrant 重建
|
| 179 |
+
if not cache_loaded and self._qdrant:
|
| 180 |
+
await self._rebuild_bm25_index()
|
| 181 |
+
|
| 182 |
+
async def _rebuild_bm25_index(self) -> None:
|
| 183 |
+
"""从 Qdrant 重建 BM25 索引"""
|
| 184 |
+
logger.info(f"🔄 重建 BM25 索引: {self.session_id}")
|
| 185 |
+
|
| 186 |
+
documents = await self._qdrant.get_all_documents()
|
| 187 |
+
|
| 188 |
+
if documents:
|
| 189 |
+
self._doc_store = documents
|
| 190 |
+
self._indexed_files = {doc.file_path for doc in documents if doc.file_path}
|
| 191 |
+
|
| 192 |
+
tokenized = [self._tokenize(doc.content) for doc in documents]
|
| 193 |
+
if tokenized:
|
| 194 |
+
self._bm25 = BM25Okapi(tokenized)
|
| 195 |
+
|
| 196 |
+
self._save_bm25_cache()
|
| 197 |
+
logger.info(f"✅ BM25 索引重建完成: {len(documents)} 文档")
|
| 198 |
+
|
| 199 |
+
def _save_bm25_cache(self) -> None:
|
| 200 |
+
"""保存 BM25 缓存 (原子写入)"""
|
| 201 |
+
if not self._doc_store:
|
| 202 |
+
return
|
| 203 |
+
|
| 204 |
+
try:
|
| 205 |
+
fd, tmp_path = tempfile.mkstemp(dir=config.context_dir)
|
| 206 |
+
with os.fdopen(fd, 'wb') as f:
|
| 207 |
+
pickle.dump({
|
| 208 |
+
"version": config.cache_version,
|
| 209 |
+
"bm25": self._bm25,
|
| 210 |
+
"doc_store": self._doc_store,
|
| 211 |
+
"indexed_files": self._indexed_files,
|
| 212 |
+
}, f)
|
| 213 |
+
|
| 214 |
+
if os.path.exists(self._cache_file):
|
| 215 |
+
os.remove(self._cache_file)
|
| 216 |
+
os.rename(tmp_path, self._cache_file)
|
| 217 |
+
|
| 218 |
+
except Exception as e:
|
| 219 |
+
logger.error(f"保存 BM25 缓存失败: {e}")
|
| 220 |
+
|
| 221 |
+
def _tokenize(self, text: str) -> List[str]:
|
| 222 |
+
"""分词"""
|
| 223 |
+
return [
|
| 224 |
+
t.lower() for t in re.split(config.tokenize_regex, text)
|
| 225 |
+
if t.strip()
|
| 226 |
+
]
|
| 227 |
+
|
| 228 |
+
async def save_context(self, repo_url: str, context_data: Dict[str, Any]) -> None:
|
| 229 |
+
"""保存仓库上下文 (异步,不阻塞事件循环)"""
|
| 230 |
+
self.repo_url = repo_url
|
| 231 |
+
self.global_context = context_data
|
| 232 |
+
await asyncio.to_thread(self._write_context_file, {
|
| 233 |
+
"repo_url": repo_url,
|
| 234 |
+
"global_context": context_data,
|
| 235 |
+
})
|
| 236 |
+
|
| 237 |
+
def _write_context_file(self, updates: Dict[str, Any]) -> None:
|
| 238 |
+
"""写入上下文文件 (同步,供线程池调用)"""
|
| 239 |
+
try:
|
| 240 |
+
existing = {}
|
| 241 |
+
if os.path.exists(self._context_file):
|
| 242 |
+
with open(self._context_file, 'r', encoding='utf-8') as f:
|
| 243 |
+
existing = json.load(f)
|
| 244 |
+
existing.update(updates)
|
| 245 |
+
with open(self._context_file, 'w', encoding='utf-8') as f:
|
| 246 |
+
json.dump(existing, f, ensure_ascii=False, indent=2)
|
| 247 |
+
except Exception as e:
|
| 248 |
+
logger.error(f"写入上下文失败: {e}")
|
| 249 |
+
|
| 250 |
+
async def save_report(self, report: str, language: str = "en") -> None:
|
| 251 |
+
"""保存技术报告 (异步,不阻塞事件循环)"""
|
| 252 |
+
await asyncio.to_thread(self._write_report, report, language)
|
| 253 |
+
|
| 254 |
+
def _write_report(self, report: str, language: str) -> None:
|
| 255 |
+
"""写入报告 (同步,供线程池调用)"""
|
| 256 |
+
try:
|
| 257 |
+
existing = {}
|
| 258 |
+
if os.path.exists(self._context_file):
|
| 259 |
+
with open(self._context_file, 'r', encoding='utf-8') as f:
|
| 260 |
+
existing = json.load(f)
|
| 261 |
+
|
| 262 |
+
if "reports" not in existing:
|
| 263 |
+
existing["reports"] = {}
|
| 264 |
+
existing["reports"][language] = report
|
| 265 |
+
existing["report"] = report
|
| 266 |
+
existing["report_language"] = language
|
| 267 |
+
|
| 268 |
+
with open(self._context_file, 'w', encoding='utf-8') as f:
|
| 269 |
+
json.dump(existing, f, ensure_ascii=False, indent=2)
|
| 270 |
+
logger.info(f"📝 报告已保存: {self.session_id} ({language})")
|
| 271 |
+
except Exception as e:
|
| 272 |
+
logger.error(f"保存报告失败: {e}")
|
| 273 |
+
|
| 274 |
+
def get_report(self, language: str = "en") -> Optional[str]:
|
| 275 |
+
"""
|
| 276 |
+
获取指定语言的报告
|
| 277 |
+
|
| 278 |
+
Args:
|
| 279 |
+
language: 语言代码 ('en', 'zh')
|
| 280 |
+
|
| 281 |
+
Returns:
|
| 282 |
+
报告内容,不存在返回 None
|
| 283 |
+
"""
|
| 284 |
+
context = self.load_context()
|
| 285 |
+
if not context:
|
| 286 |
+
return None
|
| 287 |
+
|
| 288 |
+
# 优先从 reports 字典获取
|
| 289 |
+
reports = context.get("reports", {})
|
| 290 |
+
if language in reports:
|
| 291 |
+
return reports[language]
|
| 292 |
+
|
| 293 |
+
# 兼容旧格式:如果只有 report 字段且语言匹配
|
| 294 |
+
if "report" in context:
|
| 295 |
+
stored_lang = context.get("report_language", "en")
|
| 296 |
+
if stored_lang == language:
|
| 297 |
+
return context["report"]
|
| 298 |
+
|
| 299 |
+
return None
|
| 300 |
+
|
| 301 |
+
def get_available_languages(self) -> List[str]:
|
| 302 |
+
"""获取已有报告的语言列表"""
|
| 303 |
+
context = self.load_context()
|
| 304 |
+
if not context:
|
| 305 |
+
return []
|
| 306 |
+
|
| 307 |
+
reports = context.get("reports", {})
|
| 308 |
+
return list(reports.keys())
|
| 309 |
+
|
| 310 |
+
def load_context(self) -> Optional[Dict[str, Any]]:
    """Load the repository context from the session's context file.

    On success this also restores ``self.repo_url`` and
    ``self.global_context`` from the loaded data.

    Returns:
        The parsed dictionary, or None when the file is missing or cannot
        be read/parsed (the failure is logged).
    """
    path = self._context_file
    if not os.path.exists(path):
        return None

    try:
        with open(path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)

        # Restore in-memory state from the persisted values.
        self.repo_url = payload.get("repo_url")
        self.global_context = payload.get("global_context", {})
        return payload
    except Exception as e:
        logger.error(f"加载上下文失败: {e}")
        return None
|
| 332 |
+
|
| 333 |
+
def has_index(self) -> bool:
    """Report whether a context with a non-null ``repo_url`` can be loaded."""
    loaded = self.load_context()
    if loaded is None:
        return False
    return loaded.get("repo_url") is not None
|
| 337 |
+
|
| 338 |
+
async def reset(self) -> None:
    """Reset the store; called before analysing a new repository.

    Drops and re-initialises the Qdrant collection (when a Qdrant store is
    attached), removes the local context and cache files, and clears the
    in-memory BM25 / document state.
    """
    await self.initialize()

    # Drop the vector collection and create it again.
    qdrant = self._qdrant
    if qdrant:
        await qdrant.delete_collection()
        await qdrant.initialize()

    # Remove local files.
    for stale_path in (self._context_file, self._cache_file):
        if os.path.exists(stale_path):
            os.remove(stale_path)

    # Clear in-memory state.
    self._bm25 = None
    self._doc_store = []
    self._indexed_files = set()
    self.repo_url = None
    self.global_context = {}

    logger.info(f"🗑️ 重置存储: {self.session_id}")
|
| 360 |
+
|
| 361 |
+
# 兼容旧接口
|
| 362 |
+
def reset_collection(self) -> None:
    """Synchronous wrapper around ``reset()`` kept for legacy callers.

    Must be called from synchronous code. Inside a running event loop the
    call cannot block, so a RuntimeError is raised before the coroutine is
    created (avoiding a "coroutine was never awaited" leak).

    Raises:
        RuntimeError: If invoked while an event loop is already running in
            the current thread; use ``await reset()`` there instead.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        pass  # No loop is running in this thread: safe to block below.
    else:
        raise RuntimeError(
            "reset_collection() cannot be called from a running event loop; "
            "use 'await reset()' instead"
        )

    # get_event_loop() may raise (or hand back a closed loop) when the
    # thread has no usable current loop; create and install one then.
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        loop = None
    if loop is None or loop.is_closed():
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

    loop.run_until_complete(self.reset())
|
| 365 |
+
|
| 366 |
+
async def add_documents(
    self,
    documents: List[str],
    metadatas: List[Dict[str, Any]]
) -> int:
    """Embed documents, write them to Qdrant and refresh the BM25 index.

    Args:
        documents: Document contents.
        metadatas: Metadata dictionaries, indexed in parallel with
            ``documents``.

    Returns:
        The count reported by the Qdrant store's ``add_documents``; 0 when
        there is nothing to add or every embedding failed.
    """
    if not documents:
        return 0

    await self.initialize()

    # 1. Batch-embed all documents.
    logger.info(f"📊 Embedding: {len(documents)} 个文档")
    embedder = get_embedding()
    embeddings = await embedder.embed_batch(documents, show_progress=True)

    # Keep only positions whose embedding is truthy (i.e. succeeded).
    kept = [pos for pos, vector in enumerate(embeddings) if vector]
    if not kept:
        logger.error("所有 Embedding 都失败了")
        return 0

    # 2. Build Document objects; ids continue from the current store size.
    first_number = len(self._doc_store)
    docs = []
    for offset, pos in enumerate(kept):
        meta = metadatas[pos]
        docs.append(
            Document(
                id=f"{meta.get('file', 'unknown')}_{first_number + offset}",
                content=documents[pos],
                metadata=meta,
            )
        )
    valid_embeddings = [embeddings[pos] for pos in kept]

    # 3. Write to Qdrant.
    added = await self._qdrant.add_documents(docs, valid_embeddings)

    # 4. Update BM25 state; the rebuild runs in a worker thread so it does
    #    not block the event loop.
    self._doc_store.extend(docs)
    self._indexed_files.update(d.file_path for d in docs)
    await asyncio.to_thread(self._rebuild_bm25_sync)

    return added
|
| 420 |
+
|
| 421 |
+
def _rebuild_bm25_sync(self) -> None:
    """Rebuild the BM25 index from the whole document store (synchronous).

    Intended to be executed in a worker thread; saves the BM25 cache after
    rebuilding.
    """
    corpus = []
    for stored in self._doc_store:
        corpus.append(self._tokenize(stored.content))
    self._bm25 = BM25Okapi(corpus)
    self._save_bm25_cache()
|
| 426 |
+
|
| 427 |
+
async def embed_text(self, text: str) -> List[float]:
    """Return the embedding of ``text`` from the embedding service."""
    vector = await get_embedding().embed_text(text)
    return vector
|
| 431 |
+
|
| 432 |
+
async def search_hybrid(
    self,
    query: str,
    top_k: int = None
) -> List[Dict[str, Any]]:
    """Hybrid search: vector search plus BM25, merged with RRF.

    Args:
        query: Query text.
        top_k: Number of results to return; a falsy value falls back to
            ``config.default_top_k``.

    Returns:
        Result dictionaries with ``id``, ``content``, ``file``,
        ``metadata`` and the fused ``score``.
    """
    await self.initialize()

    top_k = top_k or config.default_top_k
    # Over-fetch candidates from each retriever before fusing.
    candidate_k = top_k * config.search_oversample

    # 1. Vector search.
    vector_results: List[SearchResult] = []
    query_embedding = await self.embed_text(query)
    if query_embedding and self._qdrant:
        vector_results = await self._qdrant.search(
            query_embedding,
            top_k=candidate_k
        )

    # 2. BM25 search.
    bm25_results: List[SearchResult] = []
    if self._bm25 and self._doc_store:
        # An empty token list is replaced by a single empty token.
        tokens = self._tokenize(query) or [""]

        try:
            scores = self._bm25.get_scores(tokens)
            ranked = sorted(
                range(len(scores)),
                key=lambda pos: scores[pos],
                reverse=True
            )
            for pos in ranked[:candidate_k]:
                # Only documents with a positive BM25 score are candidates.
                if scores[pos] > 0:
                    bm25_results.append(SearchResult(
                        document=self._doc_store[pos],
                        score=scores[pos],
                        source="bm25",
                    ))
        except Exception as e:
            logger.error(f"BM25 搜索失败: {e}")

    # 3. RRF fusion.
    fused = self._rrf_fusion(vector_results, bm25_results)

    # 4. Format the output (shape kept for the legacy interface).
    return [
        {
            "id": hit.document.id,
            "content": hit.document.content,
            "file": hit.document.file_path,
            "metadata": hit.document.metadata,
            "score": hit.score,
        }
        for hit in fused[:top_k]
    ]
|
| 504 |
+
|
| 505 |
+
def _rrf_fusion(
    self,
    vector_results: List[SearchResult],
    bm25_results: List[SearchResult]
) -> List[SearchResult]:
    """Merge two ranked lists with Reciprocal Rank Fusion (RRF).

    Each occurrence of a document adds ``weight / (k + rank + 1)`` (rank
    is zero-based) to its fused score, using ``config.rrf_k`` and the
    per-source weights from ``config``.

    Returns:
        SearchResult objects with source "hybrid", sorted by fused score
        in descending order.
    """
    k = config.rrf_k
    fused: Dict[str, Dict] = {}

    # Vector hits.
    for rank, hit in enumerate(vector_results):
        slot = fused.setdefault(hit.document.id, {"result": hit, "score": 0})
        slot["score"] += config.rrf_weight_vector / (k + rank + 1)

    # BM25 hits.
    for rank, hit in enumerate(bm25_results):
        slot = fused.setdefault(hit.document.id, {"result": hit, "score": 0})
        slot["score"] += config.rrf_weight_bm25 / (k + rank + 1)

    # Highest fused score first.
    ordered = sorted(fused.values(), key=lambda slot: slot["score"], reverse=True)

    merged = []
    for slot in ordered:
        merged.append(
            SearchResult(
                document=slot["result"].document,
                score=slot["score"],
                source="hybrid",
            )
        )
    return merged
|
| 543 |
+
|
| 544 |
+
def get_documents_by_file(self, file_path: str) -> List[Dict[str, Any]]:
    """Return all stored chunks of one file (legacy-compatible shape).

    Chunks are ordered by their ``start_line`` metadata (missing values
    count as 0) and every entry carries a fixed score of 1.0.
    """
    matching = (d for d in self._doc_store if d.file_path == file_path)
    ordered = sorted(matching, key=lambda d: d.metadata.get("start_line", 0))
    return [
        {
            "id": chunk.id,
            "content": chunk.content,
            "file": chunk.file_path,
            "metadata": chunk.metadata,
            "score": 1.0,
        }
        for chunk in ordered
    ]
|
| 562 |
+
|
| 563 |
+
@property
def indexed_files(self) -> Set[str]:
    """The set of indexed file paths (the internal set itself, not a copy)."""
    files = self._indexed_files
    return files
|
| 567 |
+
|
| 568 |
+
|
| 569 |
+
# ============================================================
|
| 570 |
+
# 管理器 - LRU Cache + 过期清理
|
| 571 |
+
# ============================================================
|
| 572 |
+
|
| 573 |
+
class SessionEntry:
    """One managed session: its store plus creation/last-access timestamps."""

    __slots__ = ('store', 'last_access', 'created_at')

    def __init__(self, store: VectorStore):
        self.store = store
        # Both timestamps come from time.time() at construction.
        self.last_access = time.time()
        self.created_at = time.time()

    def touch(self) -> None:
        """Record an access by moving ``last_access`` to the current time."""
        self.last_access = time.time()
|
| 585 |
+
|
| 586 |
+
|
| 587 |
+
class VectorStoreManager:
    """
    Vector store manager implementing an LRU cache of sessions.

    Features:
    1. LRU eviction - once more than ``max_count`` sessions are held in
       memory, the least recently accessed ones are closed and dropped.
    2. Eviction only removes the in-memory session entry; it does not
       delete repository indexes or reports.
    3. Mutating coroutines are serialised with an ``asyncio.Lock``.
    """

    def __init__(self, max_count: int = None):
        # A falsy max_count (None or 0) falls back to the configured limit.
        self._max_count = max_count or config.session_max_count
        self._sessions: Dict[str, SessionEntry] = {}
        self._lock = asyncio.Lock()
        # Strong references to in-flight eviction tasks: the event loop only
        # keeps weak references, so an unreferenced task may be collected
        # before it finishes.
        self._eviction_tasks = set()

    def get_store(self, session_id: str) -> VectorStore:
        """
        Return the store for ``session_id``, creating it when missing.

        Synchronous interface kept for existing callers. Creating a new
        session may schedule a background LRU eviction.
        """
        entry = self._sessions.pop(session_id, None)
        if entry is not None:
            entry.touch()
            # Re-insert at the end so dict order mirrors recency of use.
            self._sessions[session_id] = entry
            return entry.store

        # Create a new session.
        store = VectorStore(session_id)
        self._sessions[session_id] = SessionEntry(store)

        if len(self._sessions) > self._max_count:
            self._schedule_eviction()

        logger.info(f"📦 Session 创建: {session_id} (总数: {len(self._sessions)})")
        return store

    def _schedule_eviction(self) -> None:
        """Start a background eviction task if an event loop is running."""
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            # Called from plain synchronous code: a task cannot be created
            # here. The next get_store() made inside a loop will evict.
            logger.debug("No running event loop; LRU eviction deferred")
            return

        task = loop.create_task(self._evict_lru())
        self._eviction_tasks.add(task)
        task.add_done_callback(self._on_eviction_done)

    def _on_eviction_done(self, task) -> None:
        """Release a finished eviction task and surface its error, if any."""
        self._eviction_tasks.discard(task)
        if task.cancelled():
            return
        error = task.exception()
        if error is not None:
            logger.error(f"LRU eviction failed: {error}")

    async def _evict_lru(self) -> None:
        """Close and drop least recently accessed sessions until within limit."""
        async with self._lock:
            while len(self._sessions) > self._max_count:
                # Pick the session with the oldest access time.
                oldest_id = min(
                    self._sessions.keys(),
                    key=lambda k: self._sessions[k].last_access
                )
                entry = self._sessions.pop(oldest_id)
                await entry.store.close()
                logger.info(f"🗑️ LRU 淘汰: {oldest_id}")

    async def close_session(self, session_id: str) -> None:
        """Close and forget one session; unknown ids are ignored."""
        async with self._lock:
            if session_id in self._sessions:
                entry = self._sessions.pop(session_id)
                await entry.store.close()
                logger.info(f"🔒 Session 关闭: {session_id}")

    async def close_all(self) -> None:
        """Close every session's store and clear the registry."""
        async with self._lock:
            # Iterate over a snapshot: close() awaits and may yield control.
            for session_id, entry in list(self._sessions.items()):
                await entry.store.close()
            self._sessions.clear()
            logger.info("🔒 所有 Session 已关闭")

    def get_stats(self) -> Dict[str, Any]:
        """Return manager statistics.

        Returns:
            A dict with ``total_sessions``, ``max_sessions`` and
            ``sessions`` (per-session age in hours and idle time in
            minutes, most idle first).
        """
        now = time.time()
        sessions_info = []
        for sid, entry in self._sessions.items():
            sessions_info.append({
                "session_id": sid,
                "age_hours": round((now - entry.created_at) / 3600, 2),
                "idle_minutes": round((now - entry.last_access) / 60, 2),
            })

        return {
            "total_sessions": len(self._sessions),
            "max_sessions": self._max_count,
            "sessions": sorted(sessions_info, key=lambda x: x["idle_minutes"], reverse=True)
        }
|
| 673 |
+
|
| 674 |
+
|
| 675 |
+
# Global manager instance, created once when this module is imported.
|
| 676 |
+
store_manager = VectorStoreManager()
|
app/storage/__init__.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
存储层模块
|
| 4 |
+
|
| 5 |
+
提供向量存储的抽象和实现
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from app.storage.base import (
|
| 9 |
+
Document,
|
| 10 |
+
SearchResult,
|
| 11 |
+
CollectionStats,
|
| 12 |
+
StorageBackend,
|
| 13 |
+
BaseVectorStore,
|
| 14 |
+
)
|
| 15 |
+
from app.storage.qdrant_store import (
|
| 16 |
+
QdrantConfig,
|
| 17 |
+
QdrantVectorStore,
|
| 18 |
+
QdrantStoreFactory,
|
| 19 |
+
get_qdrant_factory,
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
__all__ = [
|
| 23 |
+
# 基础类型
|
| 24 |
+
"Document",
|
| 25 |
+
"SearchResult",
|
| 26 |
+
"CollectionStats",
|
| 27 |
+
"StorageBackend",
|
| 28 |
+
"BaseVectorStore",
|
| 29 |
+
# Qdrant
|
| 30 |
+
"QdrantConfig",
|
| 31 |
+
"QdrantVectorStore",
|
| 32 |
+
"QdrantStoreFactory",
|
| 33 |
+
"get_qdrant_factory",
|
| 34 |
+
]
|
app/storage/base.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
向量存储抽象层
|
| 4 |
+
|
| 5 |
+
设计原则:
|
| 6 |
+
1. 接口与实现分离 - 易于切换存储后端
|
| 7 |
+
2. 异步优先 - 所有 I/O 操作都是异步的
|
| 8 |
+
3. 类型安全 - 完整的类型注解
|
| 9 |
+
4. 可观测 - 内置指标收集
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from abc import ABC, abstractmethod
|
| 13 |
+
from dataclasses import dataclass, field
|
| 14 |
+
from typing import List, Dict, Any, Optional, Set
|
| 15 |
+
from enum import Enum
|
| 16 |
+
import logging
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# ============================================================
|
| 22 |
+
# 数据模型
|
| 23 |
+
# ============================================================
|
| 24 |
+
|
| 25 |
+
@dataclass
class Document:
    """Document data model: an id, its text, metadata and optional vector."""
    id: str
    content: str
    metadata: Dict[str, Any] = field(default_factory=dict)
    embedding: Optional[List[float]] = None

    @property
    def file_path(self) -> str:
        """The ``file`` entry of the metadata, or "" when it is absent."""
        return self.metadata.get("file", "")

    def to_dict(self) -> Dict[str, Any]:
        """Serialise id, content and metadata (the embedding is omitted)."""
        return dict(id=self.id, content=self.content, metadata=self.metadata)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
@dataclass
class SearchResult:
    """A single search hit: the document, its score and the producing source."""
    document: Document
    score: float
    source: str = "vector"  # "vector" | "bm25" | "hybrid"

    def to_dict(self) -> Dict[str, Any]:
        """Flatten the hit into a plain dictionary."""
        doc = self.document
        return {
            "id": doc.id,
            "content": doc.content,
            "file": doc.file_path,
            "metadata": doc.metadata,
            "score": self.score,
            "source": self.source,
        }
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
@dataclass
class CollectionStats:
    """Statistics describing one collection."""
    name: str
    document_count: int
    # Each instance gets its own empty set by default.
    indexed_files: Set[str] = field(default_factory=set)
    vector_dimension: int = 0
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
class StorageBackend(Enum):
    """Storage backend type."""
    QDRANT = "qdrant"
    CHROMA = "chroma"  # kept for compatibility
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
# ============================================================
|
| 79 |
+
# 抽象基类
|
| 80 |
+
# ============================================================
|
| 81 |
+
|
| 82 |
+
class BaseVectorStore(ABC):
    """
    Abstract base class for vector stores.

    Every storage backend must implement all of these methods.
    """

    @abstractmethod
    async def initialize(self) -> None:
        """Initialise the storage connection."""

    @abstractmethod
    async def close(self) -> None:
        """Close the connection."""

    @abstractmethod
    async def add_documents(
        self,
        documents: List[Document],
        embeddings: List[List[float]]
    ) -> int:
        """
        Add documents.

        Args:
            documents: Documents to store.
            embeddings: The embedding vector for each document.

        Returns:
            Number of documents added successfully.
        """

    @abstractmethod
    async def search(
        self,
        query_embedding: List[float],
        top_k: int = 10,
        filter_conditions: Optional[Dict[str, Any]] = None
    ) -> List[SearchResult]:
        """
        Vector similarity search.

        Args:
            query_embedding: Query vector.
            top_k: Number of results to return.
            filter_conditions: Filter conditions.

        Returns:
            List of search results.
        """

    @abstractmethod
    async def delete_collection(self) -> bool:
        """Delete the current collection."""

    @abstractmethod
    async def get_stats(self) -> CollectionStats:
        """Return collection statistics."""

    @abstractmethod
    async def get_documents_by_file(self, file_path: str) -> List[Document]:
        """Return the documents belonging to ``file_path``."""
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
class BaseVectorStoreFactory(ABC):
    """Abstract base class for vector store factories."""

    @abstractmethod
    def create(self, collection_name: str) -> BaseVectorStore:
        """Create a store instance for ``collection_name``."""
|
app/storage/qdrant_store.py
ADDED
|
@@ -0,0 +1,578 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Qdrant 向量存储实现
|
| 4 |
+
|
| 5 |
+
特性:
|
| 6 |
+
1. 异步原生 - 使用 qdrant-client AsyncQdrantClient
|
| 7 |
+
2. 高性能 - 批量 upsert、HNSW 索引、payload 索引
|
| 8 |
+
3. 混合搜索 - 向量 + 稀疏向量 (FastEmbed)
|
| 9 |
+
4. 连接池 - gRPC 长连接复用
|
| 10 |
+
5. 可观测 - 完整的日志和指标
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import asyncio
|
| 14 |
+
import logging
|
| 15 |
+
import os
|
| 16 |
+
from dataclasses import dataclass
|
| 17 |
+
from typing import List, Dict, Any, Optional, Set
|
| 18 |
+
from contextlib import asynccontextmanager
|
| 19 |
+
|
| 20 |
+
from qdrant_client import AsyncQdrantClient, models
|
| 21 |
+
from qdrant_client.models import (
|
| 22 |
+
Distance,
|
| 23 |
+
VectorParams,
|
| 24 |
+
PointStruct,
|
| 25 |
+
Filter,
|
| 26 |
+
FieldCondition,
|
| 27 |
+
MatchValue,
|
| 28 |
+
PayloadSchemaType,
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
+
from app.storage.base import (
|
| 32 |
+
BaseVectorStore,
|
| 33 |
+
Document,
|
| 34 |
+
SearchResult,
|
| 35 |
+
CollectionStats,
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
logger = logging.getLogger(__name__)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# ============================================================
|
| 42 |
+
# 配置
|
| 43 |
+
# ============================================================
|
| 44 |
+
|
| 45 |
+
@dataclass
class QdrantConfig:
    """
    Qdrant configuration.

    Three modes are supported:
    - local: embedded local storage (development / single process)
    - server: Qdrant Server (multi-worker production)
    - cloud: Qdrant Cloud (managed service)

    Environment variables read by ``from_env``:
    - QDRANT_MODE: "local" | "server" | "cloud"
    - QDRANT_URL: server address (server/cloud mode)
    - QDRANT_HOST / QDRANT_PORT / QDRANT_GRPC_PORT: host-based addressing
    - QDRANT_PREFER_GRPC: "true" enables gRPC (default "true")
    - QDRANT_API_KEY: API key (required in cloud mode)
    - QDRANT_LOCAL_PATH: local storage path (local mode)
    - QDRANT_VECTOR_SIZE: vector dimension
    """
    # Mode: "local" | "server" | "cloud"
    mode: str = "local"

    # Server/Cloud settings
    url: Optional[str] = None
    host: str = "localhost"
    port: int = 6333
    grpc_port: int = 6334
    prefer_grpc: bool = True
    api_key: Optional[str] = None

    # Local mode settings
    local_path: str = "data/qdrant_db"

    # Vector settings
    vector_size: int = 1024  # BGE-M3 dimension
    distance: Distance = Distance.COSINE

    # Index settings
    hnsw_m: int = 16  # number of edges per node in the HNSW graph
    hnsw_ef_construct: int = 100  # search depth used while building

    # Batch operations
    batch_size: int = 100

    # Timeout
    timeout: float = 30.0

    @classmethod
    def from_env(cls) -> "QdrantConfig":
        """Build a configuration from environment variables."""
        mode = os.getenv("QDRANT_MODE", "local").lower()

        return cls(
            mode=mode,
            url=os.getenv("QDRANT_URL"),
            host=os.getenv("QDRANT_HOST", "localhost"),
            port=int(os.getenv("QDRANT_PORT", "6333")),
            grpc_port=int(os.getenv("QDRANT_GRPC_PORT", "6334")),
            api_key=os.getenv("QDRANT_API_KEY"),
            local_path=os.getenv("QDRANT_LOCAL_PATH", "data/qdrant_db"),
            vector_size=int(os.getenv("QDRANT_VECTOR_SIZE", "1024")),
            prefer_grpc=os.getenv("QDRANT_PREFER_GRPC", "true").lower() == "true",
        )

    @property
    def is_local(self) -> bool:
        return self.mode == "local"

    @property
    def is_server(self) -> bool:
        return self.mode == "server"

    @property
    def is_cloud(self) -> bool:
        return self.mode == "cloud"

    def validate(self) -> None:
        """Check that the configuration is usable for its mode.

        Raises:
            ValueError: For an unknown mode, for cloud mode without an API
                key or URL, or for server mode without a URL or host.
        """
        # A typo in QDRANT_MODE would otherwise be treated as "cloud" by any
        # code that falls through its local/server checks.
        if self.mode not in ("local", "server", "cloud"):
            raise ValueError(
                f"Unsupported QDRANT_MODE {self.mode!r}; expected 'local', 'server' or 'cloud'"
            )
        if self.is_cloud and not self.api_key:
            raise ValueError("QDRANT_API_KEY is required for cloud mode")
        # ``host`` defaults to "localhost", so it cannot stand in for a
        # missing cloud endpoint.
        if self.is_cloud and not self.url:
            raise ValueError("QDRANT_URL is required for cloud mode")
        if self.is_server and not (self.url or self.host):
            raise ValueError("QDRANT_URL or QDRANT_HOST is required for server/cloud mode")
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
# ============================================================
|
| 127 |
+
# 全局共享客户端单例
|
| 128 |
+
# ============================================================
|
| 129 |
+
|
| 130 |
+
# Process-wide client instance; None until get_shared_client() creates it.
_shared_client: Optional[AsyncQdrantClient] = None
|
| 131 |
+
# Configuration the shared client was created with.
_shared_config: Optional[QdrantConfig] = None
|
| 132 |
+
# Serialises creation of the shared client in get_shared_client().
_client_lock = asyncio.Lock()
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
async def get_shared_client(config: Optional[QdrantConfig] = None) -> AsyncQdrantClient:
    """
    Return the shared Qdrant client, creating it on first use.

    The client is built from ``config`` (or from environment variables when
    ``config`` is None) only if no shared client exists yet; afterwards the
    existing client is returned and ``config`` is not consulted.

    Modes:
    - local: embedded local storage (single process, development)
    - server: Qdrant Server (multi-worker, Docker deployment)
    - cloud: Qdrant Cloud (managed service)
    """
    global _shared_client, _shared_config

    async with _client_lock:
        if _shared_client is not None:
            return _shared_client

        _shared_config = config or QdrantConfig.from_env()
        _shared_config.validate()
        cfg = _shared_config

        if cfg.is_local:
            # Local mode: embedded storage on disk.
            os.makedirs(cfg.local_path, exist_ok=True)
            _shared_client = AsyncQdrantClient(
                path=cfg.local_path,
                timeout=cfg.timeout,
            )
            logger.info(f"📦 Qdrant 本地模式: {cfg.local_path}")
        elif cfg.is_server and cfg.url:
            # Server mode addressed by URL.
            _shared_client = AsyncQdrantClient(
                url=cfg.url,
                prefer_grpc=cfg.prefer_grpc,
                timeout=cfg.timeout,
            )
            logger.info(f"🌐 Qdrant Server 模式: {cfg.url}")
        elif cfg.is_server:
            # Server mode addressed by host and ports.
            _shared_client = AsyncQdrantClient(
                host=cfg.host,
                port=cfg.port,
                grpc_port=cfg.grpc_port,
                prefer_grpc=cfg.prefer_grpc,
                timeout=cfg.timeout,
            )
            logger.info(f"🌐 Qdrant Server 模式: {cfg.host}:{cfg.port}")
        else:
            # Any other mode is handled as Qdrant Cloud.
            _shared_client = AsyncQdrantClient(
                url=cfg.url,
                api_key=cfg.api_key,
                timeout=cfg.timeout,
            )
            logger.info(f"☁️ Qdrant Cloud 模式: {cfg.url}")

        return _shared_client
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
async def close_shared_client() -> None:
    """Close the shared Qdrant client, if one exists, and forget it."""
    global _shared_client
    if _shared_client is None:
        return
    await _shared_client.close()
    _shared_client = None
    logger.info("🔒 Qdrant 共享客户端已关闭")
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
# ============================================================
|
| 203 |
+
# Qdrant 存储实现
|
| 204 |
+
# ============================================================
|
| 205 |
+
|
| 206 |
+
class QdrantVectorStore(BaseVectorStore):
|
| 207 |
+
"""
|
| 208 |
+
Qdrant 向量存储
|
| 209 |
+
|
| 210 |
+
使用示例:
|
| 211 |
+
```python
|
| 212 |
+
config = QdrantConfig.from_env()
|
| 213 |
+
store = QdrantVectorStore("my_collection", config)
|
| 214 |
+
|
| 215 |
+
await store.initialize()
|
| 216 |
+
|
| 217 |
+
# 添加文档
|
| 218 |
+
docs = [Document(id="1", content="hello", metadata={"file": "a.py"})]
|
| 219 |
+
embeddings = [[0.1, 0.2, ...]]
|
| 220 |
+
await store.add_documents(docs, embeddings)
|
| 221 |
+
|
| 222 |
+
# 搜索
|
| 223 |
+
results = await store.search(query_embedding, top_k=5)
|
| 224 |
+
|
| 225 |
+
await store.close()
|
| 226 |
+
```
|
| 227 |
+
"""
|
| 228 |
+
|
| 229 |
+
# Payload 字段名常量
|
| 230 |
+
FIELD_CONTENT = "content"
|
| 231 |
+
FIELD_FILE = "file"
|
| 232 |
+
FIELD_METADATA = "metadata"
|
| 233 |
+
|
| 234 |
+
def __init__(
    self,
    collection_name: str,
    config: Optional[QdrantConfig] = None
):
    """Create a store bound to one collection.

    Args:
        collection_name: Requested name; it is passed through
            ``_sanitize_name`` before being stored.
        config: Qdrant settings; loaded from environment variables when
            omitted.
    """
    self.collection_name = self._sanitize_name(collection_name)
    if config:
        self.config = config
    else:
        self.config = QdrantConfig.from_env()
    # Set to True by initialize().
    self._initialized = False
|
| 242 |
+
|
| 243 |
+
@staticmethod
|
| 244 |
+
def _sanitize_name(name: str) -> str:
|
| 245 |
+
"""清理集合名称"""
|
| 246 |
+
import re
|
| 247 |
+
clean = re.sub(r'[^a-zA-Z0-9_-]', '_', name)
|
| 248 |
+
return clean[:63] if clean else "default"
|
| 249 |
+
|
| 250 |
+
async def _get_client(self) -> AsyncQdrantClient:
    """Return the shared client (works around Qdrant Local concurrent-access issues)."""
    client = await get_shared_client(self.config)
    return client
|
| 253 |
+
|
| 254 |
+
async def initialize(self) -> None:
    """Ensure the collection exists; creates it with a payload index if missing.

    Does nothing when this store has already been initialised.
    """
    if self._initialized:
        return

    client = await self._get_client()

    # Check whether the collection already exists.
    collections = await client.get_collections()
    exists = any(c.name == self.collection_name for c in collections.collections)

    if not exists:
        # Create the collection. The optimizer is left at Qdrant's defaults:
        # the previous ``indexing_threshold=0`` override was meant to index
        # immediately, but in Qdrant a threshold of 0 disables vector
        # indexing, so no HNSW index would be built for these segments.
        await client.create_collection(
            collection_name=self.collection_name,
            vectors_config=VectorParams(
                size=self.config.vector_size,
                distance=self.config.distance,
                hnsw_config=models.HnswConfigDiff(
                    m=self.config.hnsw_m,
                    ef_construct=self.config.hnsw_ef_construct,
                ),
            ),
        )

        # Keyword payload index on the file field, to speed up filtering
        # by file.
        await client.create_payload_index(
            collection_name=self.collection_name,
            field_name=self.FIELD_FILE,
            field_schema=PayloadSchemaType.KEYWORD,
        )

        logger.info(f"✅ 创建集合: {self.collection_name}")
    else:
        logger.debug(f"📂 集合已存在: {self.collection_name}")

    self._initialized = True
|
| 295 |
+
|
| 296 |
+
async def close(self) -> None:
    """
    Mark this store as closed without closing the shared client.

    The client is shared between stores, so closing a single store only
    resets its initialised flag. Use close_shared_client() for a global
    shutdown.
    """
    self._initialized = False
    logger.debug(f"🔌 Store 已关闭: {self.collection_name}")
|
| 305 |
+
|
| 306 |
+
async def add_documents(
    self,
    documents: List[Document],
    embeddings: List[List[float]]
) -> int:
    """
    Upsert documents with their vectors, in batches.

    Pairs whose vector is empty or whose length differs from the configured
    vector size are skipped. A failing batch is logged and does not stop
    the remaining batches.

    Args:
        documents: Documents to store.
        embeddings: One vector per document, in the same order.

    Returns:
        Number of points in the batches that were written successfully.

    Raises:
        ValueError: If the two lists have different lengths.
    """
    if not documents or not embeddings:
        return 0

    if len(documents) != len(embeddings):
        raise ValueError(f"文档数量 ({len(documents)}) 与向量数量 ({len(embeddings)}) 不匹配")

    await self.initialize()
    client = await self._get_client()

    # Keep only the pairs that carry a vector of the expected dimension.
    usable = []
    for doc, emb in zip(documents, embeddings):
        if emb and len(emb) == self.config.vector_size:
            usable.append((doc, emb))

    if not usable:
        logger.warning("没有有效的文档向量对")
        return 0

    points = [
        PointStruct(
            id=self._generate_point_id(doc.id),
            vector=embedding,
            payload={
                self.FIELD_CONTENT: doc.content,
                self.FIELD_FILE: doc.file_path,
                self.FIELD_METADATA: doc.metadata,
                "doc_id": doc.id,
            },
        )
        for doc, embedding in usable
    ]

    total_added = 0
    batch_size = self.config.batch_size

    for i in range(0, len(points), batch_size):
        batch = points[i:i + batch_size]
        try:
            await client.upsert(
                collection_name=self.collection_name,
                points=batch,
                wait=True,
            )
        except Exception as e:
            # Best effort: report the failed batch and carry on.
            logger.error(f"批次 {i // batch_size + 1} 写入失败: {e}")
        else:
            total_added += len(batch)

    logger.info(f"✅ 写入 {total_added}/{len(points)} 个文档到 {self.collection_name}")
    return total_added
|
| 364 |
+
|
| 365 |
+
def _generate_point_id(self, doc_id: str) -> int:
|
| 366 |
+
"""生成数值型 Point ID (Qdrant 要求)"""
|
| 367 |
+
import hashlib
|
| 368 |
+
hash_bytes = hashlib.sha256(doc_id.encode()).digest()
|
| 369 |
+
# 取前 8 字节转为正整数
|
| 370 |
+
return int.from_bytes(hash_bytes[:8], byteorder='big') & 0x7FFFFFFFFFFFFFFF
|
| 371 |
+
|
| 372 |
+
async def search(
    self,
    query_embedding: List[float],
    top_k: int = 10,
    filter_conditions: Optional[Dict[str, Any]] = None
) -> List[SearchResult]:
    """
    Run a vector similarity search.

    Args:
        query_embedding: Query vector; an empty vector yields no results.
        top_k: Maximum number of hits to return.
        filter_conditions: Optional ``field -> value`` mapping; every entry
            becomes an exact-match condition that a hit must satisfy.

    Returns:
        Hits as ``SearchResult`` objects with source ``"vector"``, or an
        empty list when the query fails (the error is logged).
    """
    if not query_embedding:
        return []

    await self.initialize()
    client = await self._get_client()

    # Translate the plain mapping into a Qdrant "must" filter.
    query_filter = None
    if filter_conditions:
        query_filter = Filter(
            must=[
                FieldCondition(key=field, match=MatchValue(value=value))
                for field, value in filter_conditions.items()
            ]
        )

    def _to_result(hit):
        # Rebuild the document from the stored payload.
        payload = hit.payload or {}
        return SearchResult(
            document=Document(
                id=payload.get("doc_id", str(hit.id)),
                content=payload.get(self.FIELD_CONTENT, ""),
                metadata=payload.get(self.FIELD_METADATA, {}),
            ),
            score=hit.score,
            source="vector",
        )

    try:
        # query_points is available in qdrant-client >= 1.7.0 (per the original comment).
        results = await client.query_points(
            collection_name=self.collection_name,
            query=query_embedding,
            limit=top_k,
            query_filter=query_filter,
            with_payload=True,
            score_threshold=0.0,
        )
        return [_to_result(hit) for hit in results.points]

    except Exception as e:
        logger.error(f"搜索失败: {e}")
        return []
|
| 428 |
+
|
| 429 |
+
async def delete_collection(self) -> bool:
    """
    Delete this store's collection.

    Returns:
        True when the collection was deleted, False when the deletion
        raised (the error is logged).
    """
    deleted = False
    try:
        client = await self._get_client()
        await client.delete_collection(self.collection_name)
        # Force re-creation on the next use of this store.
        self._initialized = False
        logger.info(f"🗑️ 删除集合: {self.collection_name}")
        deleted = True
    except Exception as e:
        logger.error(f"删除集合失败: {e}")
    return deleted
|
| 440 |
+
|
| 441 |
+
async def get_stats(self) -> CollectionStats:
    """
    Collect statistics for this collection.

    The set of indexed files is built by scrolling through the whole
    collection page by page. Only the file field of each payload is
    fetched.

    Returns:
        ``CollectionStats`` with the point count, the distinct file paths
        and the configured vector dimension. If anything fails, the error
        is logged and stats with a document count of 0 are returned.
    """
    await self.initialize()
    client = await self._get_client()

    try:
        info = await client.get_collection(self.collection_name)

        # Follow the scroll cursor until it is exhausted; a single page
        # would silently miss files in collections larger than one page.
        indexed_files: Set[str] = set()
        offset = None
        while True:
            points, offset = await client.scroll(
                collection_name=self.collection_name,
                limit=10000,
                offset=offset,
                with_payload=[self.FIELD_FILE],
            )

            for point in points:
                if point.payload:
                    file_path = point.payload.get(self.FIELD_FILE)
                    if file_path:
                        indexed_files.add(file_path)

            if offset is None:
                break

        return CollectionStats(
            name=self.collection_name,
            document_count=info.points_count or 0,
            indexed_files=indexed_files,
            vector_dimension=self.config.vector_size,
        )
    except Exception as e:
        logger.error(f"获取统计失败: {e}")
        return CollectionStats(name=self.collection_name, document_count=0)
|
| 472 |
+
|
| 473 |
+
async def get_documents_by_file(self, file_path: str) -> List[Document]:
    """
    Return every stored document that belongs to one file.

    Args:
        file_path: Value of the file payload field to match exactly.

    Returns:
        Matching documents sorted by their ``start_line`` metadata
        (missing values sort as 0), or an empty list when the lookup
        fails (the error is logged).
    """
    await self.initialize()
    client = await self._get_client()

    file_filter = Filter(
        must=[
            FieldCondition(
                key=self.FIELD_FILE,
                match=MatchValue(value=file_path),
            )
        ]
    )

    try:
        documents = []
        offset = None
        # Page through the matches so files with more chunks than one page
        # holds are not truncated.
        while True:
            points, offset = await client.scroll(
                collection_name=self.collection_name,
                scroll_filter=file_filter,
                limit=1000,
                offset=offset,
                with_payload=True,
            )

            for point in points:
                payload = point.payload or {}
                documents.append(Document(
                    id=payload.get("doc_id", str(point.id)),
                    content=payload.get(self.FIELD_CONTENT, ""),
                    metadata=payload.get(self.FIELD_METADATA, {}),
                ))

            if offset is None:
                break

        # Order the chunks by their position in the file.
        documents.sort(key=lambda d: d.metadata.get("start_line", 0))
        return documents

    except Exception as e:
        logger.error(f"获取文件文档失败: {e}")
        return []
|
| 510 |
+
|
| 511 |
+
async def get_all_documents(self) -> List[Document]:
    """
    Return every document in the collection (used to build the BM25 index).

    The collection is read in pages of 1000 points by following the scroll
    cursor.

    Returns:
        All stored documents, or an empty list when any page fails (the
        error is logged and pages read so far are discarded).
    """
    await self.initialize()
    client = await self._get_client()

    def _to_document(point):
        payload = point.payload or {}
        return Document(
            id=payload.get("doc_id", str(point.id)),
            content=payload.get(self.FIELD_CONTENT, ""),
            metadata=payload.get(self.FIELD_METADATA, {}),
        )

    documents = []
    cursor = None

    try:
        has_more = True
        while has_more:
            points, cursor = await client.scroll(
                collection_name=self.collection_name,
                limit=1000,
                offset=cursor,
                with_payload=True,
            )
            documents.extend([_to_document(point) for point in points])
            # A missing cursor means the last page has been read.
            has_more = cursor is not None

        return documents

    except Exception as e:
        logger.error(f"获取所有文档失败: {e}")
        return []
|
| 548 |
+
|
| 549 |
+
|
| 550 |
+
# ============================================================
|
| 551 |
+
# 工厂
|
| 552 |
+
# ============================================================
|
| 553 |
+
|
| 554 |
+
class QdrantStoreFactory:
    """Factory producing Qdrant vector stores that share one configuration."""

    def __init__(self, config: Optional[QdrantConfig] = None):
        # Fall back to the environment-derived configuration when none is given.
        self.config = config if config else QdrantConfig.from_env()

    def create(self, collection_name: str) -> QdrantVectorStore:
        """Build a store for ``collection_name`` using the factory's configuration."""
        store = QdrantVectorStore(collection_name, self.config)
        return store

    async def get_client(self) -> AsyncQdrantClient:
        """Return the shared Qdrant client for the factory's configuration."""
        shared_client = await get_shared_client(self.config)
        return shared_client
|
| 567 |
+
|
| 568 |
+
|
| 569 |
+
# Process-wide factory instance, created lazily on first request.
_qdrant_factory: Optional[QdrantStoreFactory] = None


def get_qdrant_factory(config: Optional[QdrantConfig] = None) -> QdrantStoreFactory:
    """
    Return the process-wide factory, building it on the first call.

    ``config`` is only used by the call that creates the instance; later
    calls return the existing factory unchanged.
    """
    global _qdrant_factory
    if _qdrant_factory is not None:
        return _qdrant_factory
    _qdrant_factory = QdrantStoreFactory(config)
    return _qdrant_factory
|
app/utils/embedding.py
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Embedding 服务 - 并发优化版
|
| 4 |
+
|
| 5 |
+
特性:
|
| 6 |
+
1. 并发批量请求 - 使用 asyncio.gather 并行处理多个批次
|
| 7 |
+
2. 信号量控制 - 限制最大并发数,避免 API 限流
|
| 8 |
+
3. 重试机制 - 使用 tenacity 处理临时性错误
|
| 9 |
+
4. 智能分批 - 根据 token 数量动态调整批次大小
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import asyncio
|
| 13 |
+
import logging
|
| 14 |
+
from typing import List, Optional
|
| 15 |
+
from dataclasses import dataclass
|
| 16 |
+
|
| 17 |
+
from openai import AsyncOpenAI
|
| 18 |
+
|
| 19 |
+
from app.core.config import settings
|
| 20 |
+
from app.utils.retry import llm_retry, is_retryable_error
|
| 21 |
+
|
| 22 |
+
logger = logging.getLogger(__name__)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
@dataclass
class EmbeddingConfig:
    """Settings for the embedding service."""

    # --- API ---
    api_base_url: str = "https://api.siliconflow.cn/v1"
    model_name: str = "BAAI/bge-m3"

    # --- Batching ---
    batch_size: int = 50  # texts per request
    max_text_length: int = 8000  # maximum characters kept per text

    # --- Concurrency ---
    max_concurrent_batches: int = 5  # upper bound on batches in flight

    # --- Timeouts ---
    timeout: int = 60  # per-request timeout, in seconds
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class EmbeddingService:
    """
    Concurrent embedding service.

    Example:
    ```python
    service = EmbeddingService()

    # single text
    embedding = await service.embed_text("Hello world")

    # many texts (batched and run concurrently)
    texts = ["text1", "text2", ..., "text100"]
    embeddings = await service.embed_batch(texts)
    ```
    """

    def __init__(self, config: Optional[EmbeddingConfig] = None):
        self.config = config or EmbeddingConfig()

        # OpenAI client; SiliconFlow speaks the OpenAI protocol (per the
        # original comment).
        self._client = AsyncOpenAI(
            api_key=settings.SILICON_API_KEY,
            base_url=self.config.api_base_url,
            timeout=self.config.timeout
        )

        # Caps the number of batches in flight at once.
        self._semaphore = asyncio.Semaphore(self.config.max_concurrent_batches)

        # Counters; "retried_requests" is not incremented anywhere in this class.
        self._stats = {
            "total_requests": 0,
            "successful_requests": 0,
            "failed_requests": 0,
            "total_texts": 0,
            "retried_requests": 0
        }

    def _preprocess_text(self, text: str) -> str:
        """Replace newlines with spaces, strip, and cut to ``max_text_length`` characters."""
        text = text.replace("\n", " ").strip()
        if len(text) > self.config.max_text_length:
            text = text[:self.config.max_text_length]
        return text

    @llm_retry
    async def _embed_single_batch(self, texts: List[str]) -> List[List[float]]:
        """
        Send one embedding request (wrapped by the ``llm_retry`` decorator).

        Args:
            texts: Already preprocessed texts.

        Returns:
            One embedding vector per item of the response data.
        """
        self._stats["total_requests"] += 1

        response = await self._client.embeddings.create(
            input=texts,
            model=self.config.model_name
        )

        self._stats["successful_requests"] += 1
        return [item.embedding for item in response.data]

    async def _embed_batch_with_semaphore(
        self,
        batch_texts: List[str],
        batch_index: int
    ) -> tuple[int, List[List[float]]]:
        """
        Embed one batch while holding the concurrency semaphore.

        Returns:
            ``(batch_index, embeddings)``.

        Raises:
            Exception: Whatever the request raised, after it has been
                counted and logged.
        """
        async with self._semaphore:
            try:
                embeddings = await self._embed_single_batch(batch_texts)
                logger.debug(f"✅ 批次 {batch_index} 完成: {len(batch_texts)} 文本")
                return (batch_index, embeddings)
            except Exception as e:
                self._stats["failed_requests"] += 1
                logger.error(f"❌ 批次 {batch_index} 失败: {type(e).__name__}: {e}")
                raise

    async def embed_text(self, text: str) -> List[float]:
        """
        Embed a single text.

        Args:
            text: Input text.

        Returns:
            The embedding vector, or an empty list when the text is empty
            after preprocessing or the request fails.
        """
        try:
            processed = self._preprocess_text(text)
            if not processed:
                return []

            self._stats["total_texts"] += 1
            embeddings = await self._embed_single_batch([processed])
            return embeddings[0] if embeddings else []
        except Exception as e:
            logger.error(f"embed_text 失败: {e}")
            return []

    async def embed_batch(
        self,
        texts: List[str],
        show_progress: bool = False
    ) -> List[List[float]]:
        """
        Embed many texts, running the batches concurrently.

        Args:
            texts: Input texts.
            show_progress: Whether to log a summary before and after.

        Returns:
            One entry per input text, in input order. Texts that belong to
            a failed batch get an empty list in their own position.
        """
        if not texts:
            return []

        processed_texts = [self._preprocess_text(t) for t in texts]
        self._stats["total_texts"] += len(texts)

        batch_size = self.config.batch_size
        batches = [
            processed_texts[i:i + batch_size]
            for i in range(0, len(processed_texts), batch_size)
        ]

        total_batches = len(batches)
        if show_progress:
            logger.info(
                f"📊 Embedding: {len(texts)} 文本 → {total_batches} 批次 "
                f"(并发: {self.config.max_concurrent_batches})"
            )

        tasks = [
            self._embed_batch_with_semaphore(batch, idx)
            for idx, batch in enumerate(batches)
        ]

        results = await asyncio.gather(*tasks, return_exceptions=True)

        # gather() returns results in task order, so results[i] belongs to
        # batches[i]. Each batch is filled in place so that a failure never
        # shifts the embeddings of later batches onto the wrong texts.
        embeddings: List[List[float]] = []
        for batch, result in zip(batches, results):
            batch_embeddings = result[1] if isinstance(result, tuple) else None

            if batch_embeddings is None or len(batch_embeddings) != len(batch):
                # Failed request, or a response that cannot be mapped back
                # one-to-one: pad with exactly this batch's size.
                failed_batch_size = len(batch)
                embeddings.extend([[] for _ in range(failed_batch_size)])
                logger.warning(f"批次失败,填充 {failed_batch_size} 个空向量")
                continue

            embeddings.extend(batch_embeddings)

        if show_progress:
            success_count = sum(1 for e in embeddings if e)
            logger.info(f"✅ Embedding 完成: {success_count}/{len(texts)} 成功")

        return embeddings

    def get_stats(self) -> dict:
        """Return a copy of the counters."""
        return self._stats.copy()

    def reset_stats(self):
        """Reset every counter to 0."""
        for key in self._stats:
            self._stats[key] = 0
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
# Process-wide service instance, created lazily on first request.
_embedding_service: Optional[EmbeddingService] = None


def get_embedding_service(config: Optional[EmbeddingConfig] = None) -> EmbeddingService:
    """
    Return the process-wide embedding service, building it on the first call.

    ``config`` is only used by the call that creates the instance; later
    calls return the existing service unchanged.
    """
    global _embedding_service
    if _embedding_service is not None:
        return _embedding_service
    _embedding_service = EmbeddingService(config)
    return _embedding_service
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
# 便捷函数
|
| 247 |
+
async def embed_text(text: str) -> List[float]:
    """Shortcut: embed one text with the shared embedding service."""
    service = get_embedding_service()
    return await service.embed_text(text)
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
async def embed_batch(texts: List[str], show_progress: bool = False) -> List[List[float]]:
    """Shortcut: embed many texts with the shared embedding service."""
    service = get_embedding_service()
    return await service.embed_batch(texts, show_progress)
|
app/utils/github_client.py
ADDED
|
@@ -0,0 +1,478 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
GitHub 异步客户端
|
| 4 |
+
|
| 5 |
+
设计原则:
|
| 6 |
+
1. 异步非阻塞 - 使用 httpx.AsyncClient
|
| 7 |
+
2. 连接池复用 - 单例模式管理客户端生命周期
|
| 8 |
+
3. 自动重试 - 集成 tenacity 处理瞬时错误
|
| 9 |
+
4. 类型安全 - 完整的类型注解
|
| 10 |
+
5. 可扩展 - 易于添加新的 API 端点
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import asyncio
|
| 14 |
+
import base64
|
| 15 |
+
import logging
|
| 16 |
+
import os
|
| 17 |
+
from dataclasses import dataclass, field
|
| 18 |
+
from typing import List, Optional, Dict, Any, Set
|
| 19 |
+
from contextlib import asynccontextmanager
|
| 20 |
+
|
| 21 |
+
import httpx
|
| 22 |
+
|
| 23 |
+
from app.core.config import settings
|
| 24 |
+
from app.utils.retry import llm_retry # 复用已有的重试装饰器
|
| 25 |
+
|
| 26 |
+
logger = logging.getLogger(__name__)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# ============================================================
|
| 30 |
+
# 数据模型
|
| 31 |
+
# ============================================================
|
| 32 |
+
|
| 33 |
+
@dataclass
class GitHubFile:
    """One entry of a GitHub file tree."""
    path: str
    type: str  # "blob" | "tree"
    size: int = 0
    sha: str = ""

    @property
    def is_file(self) -> bool:
        """True when the entry type is ``"blob"``."""
        return self.type == "blob"

    @property
    def is_directory(self) -> bool:
        """True when the entry type is ``"tree"``."""
        return self.type == "tree"
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
@dataclass
class GitHubRepo:
    """Basic information about a GitHub repository."""
    owner: str
    name: str
    default_branch: str = "main"
    description: str = ""
    stars: int = 0

    @property
    def full_name(self) -> str:
        """The repository in ``owner/name`` form."""
        return f"{self.owner}/{self.name}"
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
@dataclass
class FileFilter:
    """
    Rules deciding which repository files are kept.

    ``ignored_extensions`` holds lower-case extensions (``".png"``) as well
    as whole dotfile names (``".gitignore"``); both forms are honoured.
    """
    ignored_extensions: Set[str] = field(default_factory=lambda: {
        '.png', '.jpg', '.jpeg', '.gif', '.svg', '.ico', '.mp4', '.webp',
        '.pyc', '.pyo', '.lock', '.zip', '.tar', '.gz', '.pdf', '.woff', '.woff2',
        '.DS_Store', '.gitignore', '.gitattributes', '.editorconfig'
    })

    ignored_directories: Set[str] = field(default_factory=lambda: {
        '.git', '.github', '.vscode', '.idea', '__pycache__',
        'node_modules', 'venv', 'env', '.env', 'build', 'dist',
        'site-packages', 'migrations', '.next', '.nuxt', 'coverage',
        'vendor', 'target', 'out', 'bin', 'obj'
    })

    max_file_size: int = 500_000  # 500KB

    def should_include(self, file: "GitHubFile") -> bool:
        """
        Decide whether a tree entry should be kept.

        Args:
            file: Tree entry; its ``is_file``, ``path`` and ``size`` are read.

        Returns:
            False for non-files, for paths with an ignored component, for
            ignored extensions or file names, and for files larger than
            ``max_file_size``; True otherwise.
        """
        if not file.is_file:
            return False

        # Every path component is checked, the file name included, so a
        # file called ".env" is excluded here as well.
        path_parts = file.path.split("/")
        if any(part in self.ignored_directories for part in path_parts):
            return False

        # splitext() reports an empty extension for dotfiles such as
        # ".gitignore", so those entries are matched on the whole name.
        if path_parts[-1] in self.ignored_extensions:
            return False

        ext = os.path.splitext(file.path)[1].lower()
        if ext in self.ignored_extensions:
            return False

        if file.size > self.max_file_size:
            return False

        return True
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
# ============================================================
|
| 105 |
+
# 异常定义
|
| 106 |
+
# ============================================================
|
| 107 |
+
|
| 108 |
+
class GitHubError(Exception):
    """
    Base class for GitHub API errors.

    Attributes:
        message: Human-readable description, also used as the exception text.
        status_code: HTTP status of the failed response (0 when not given).
    """

    def __init__(self, message: str, status_code: int = 0):
        super().__init__(message)
        self.status_code = status_code
        self.message = message
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
class GitHubAuthError(GitHubError):
    """Authentication error (401)."""
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
class GitHubRateLimitError(GitHubError):
    """Rate-limit error (403)."""
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
class GitHubNotFoundError(GitHubError):
    """Resource not found (404)."""
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
# ============================================================
|
| 132 |
+
# GitHub 异步客户端
|
| 133 |
+
# ============================================================
|
| 134 |
+
|
| 135 |
+
class GitHubClient:
|
| 136 |
+
"""
|
| 137 |
+
GitHub 异步 API 客户端
|
| 138 |
+
|
| 139 |
+
使用示例:
|
| 140 |
+
```python
|
| 141 |
+
async with GitHubClient() as client:
|
| 142 |
+
repo = await client.get_repo("owner", "repo")
|
| 143 |
+
files = await client.get_repo_tree(repo)
|
| 144 |
+
content = await client.get_file_content(repo, "README.md")
|
| 145 |
+
```
|
| 146 |
+
"""
|
| 147 |
+
|
| 148 |
+
BASE_URL = "https://api.github.com"
|
| 149 |
+
|
| 150 |
+
def __init__(
    self,
    token: Optional[str] = None,
    timeout: float = 30.0,
    max_concurrent_requests: int = 10
):
    """
    Store the connection settings; no HTTP client is created yet.

    Args:
        token: GitHub token; falls back to ``settings.GITHUB_TOKEN`` when
            not given.
        timeout: Timeout value passed to the HTTP client.
        max_concurrent_requests: Size of the semaphore that limits
            concurrent requests.
    """
    self.token = token if token else settings.GITHUB_TOKEN
    self.timeout = timeout
    # The HTTP client is created lazily by _ensure_client().
    self._client: Optional[httpx.AsyncClient] = None
    self._semaphore = asyncio.Semaphore(max_concurrent_requests)
|
| 160 |
+
|
| 161 |
+
@property
|
| 162 |
+
def _headers(self) -> Dict[str, str]:
|
| 163 |
+
"""构建请求头"""
|
| 164 |
+
headers = {
|
| 165 |
+
"Accept": "application/vnd.github.v3+json",
|
| 166 |
+
"User-Agent": "GitHub-Agent-Demo/1.0"
|
| 167 |
+
}
|
| 168 |
+
if self.token:
|
| 169 |
+
headers["Authorization"] = f"Bearer {self.token}"
|
| 170 |
+
return headers
|
| 171 |
+
|
| 172 |
+
async def _ensure_client(self) -> httpx.AsyncClient:
    """
    Return a usable HTTP client, creating one when needed.

    A new client is built when none exists yet or the current one has
    been closed.
    """
    current = self._client
    if current is not None and not current.is_closed:
        return current

    pool_limits = httpx.Limits(
        max_keepalive_connections=20,
        max_connections=50
    )
    self._client = httpx.AsyncClient(
        base_url=self.BASE_URL,
        headers=self._headers,
        timeout=httpx.Timeout(self.timeout),
        follow_redirects=True,
        limits=pool_limits
    )
    return self._client
|
| 186 |
+
|
| 187 |
+
async def close(self):
    """Close the HTTP client if one is open, then forget it."""
    client = self._client
    if client and not client.is_closed:
        await client.aclose()
        self._client = None
|
| 192 |
+
|
| 193 |
+
async def __aenter__(self):
    """Enter the async context: make sure the HTTP client exists and return this object."""
    await self._ensure_client()
    return self
|
| 196 |
+
|
| 197 |
+
async def __aexit__(self, exc_type, exc_val, exc_tb):
    """Leave the async context: close the HTTP client; exceptions are not suppressed."""
    await self.close()
|
| 199 |
+
|
| 200 |
+
def _handle_error(self, response: httpx.Response, context: str = ""):
    """
    Translate an error response into the matching exception.

    Args:
        response: The failed HTTP response.
        context: Optional prefix for the error text (callers pass the endpoint).

    Raises:
        GitHubAuthError: For status 401.
        GitHubRateLimitError: For status 429, or 403 with a "rate limit" message.
        GitHubNotFoundError: For status 404.
        GitHubError: For every other status.
    """
    status = response.status_code

    try:
        data = response.json()
        message = data.get("message", response.text)
    except Exception:
        # Body is not JSON, or is JSON but not an object.
        message = response.text

    # "message" can be present but null or non-text; fall back to the raw
    # body so the .lower() check below cannot fail.
    if not isinstance(message, str):
        message = response.text

    error_msg = f"{context}: {message}" if context else message

    if status == 401:
        raise GitHubAuthError(
            "GitHub Token 无效或已过期,请检查 .env 配置",
            status
        )

    # GitHub reports exceeded rate limits with either 403 or 429.
    if status == 429 or (status == 403 and "rate limit" in message.lower()):
        raise GitHubRateLimitError(
            "GitHub API 请求已达上限,请稍后重试或添加 Token",
            status
        )

    if status == 404:
        raise GitHubNotFoundError(error_msg, status)

    raise GitHubError(error_msg, status)
|
| 228 |
+
|
| 229 |
+
@llm_retry
async def _request(
    self,
    method: str,
    endpoint: str,
    **kwargs
) -> Dict[str, Any]:
    """
    Send an API request and decode the JSON body (wrapped by ``llm_retry``).

    Args:
        method: HTTP method.
        endpoint: API endpoint, e.g. /repos/{owner}/{repo}.
        **kwargs: Passed through to the httpx request call.

    Returns:
        The decoded JSON response.

    Raises:
        GitHubError: (or a subclass) when the status code is 400 or above.
    """
    async with self._semaphore:
        http = await self._ensure_client()
        resp = await http.request(method, endpoint, **kwargs)

        failed = resp.status_code >= 400
        if failed:
            # Always raises; the endpoint is used as error context.
            self._handle_error(resp, endpoint)

        return resp.json()
|
| 255 |
+
|
| 256 |
+
async def _request_raw(
    self,
    method: str,
    endpoint: str,
    **kwargs
) -> httpx.Response:
    """
    Send a request and hand back the undecoded response.

    No status check is done here; the caller inspects the response.
    """
    async with self._semaphore:
        http = await self._ensure_client()
        raw_response = await http.request(method, endpoint, **kwargs)
    return raw_response
|
| 266 |
+
|
| 267 |
+
# --------------------------------------------------------
|
| 268 |
+
# 仓库相关 API
|
| 269 |
+
# --------------------------------------------------------
|
| 270 |
+
|
| 271 |
+
async def get_repo(self, owner: str, name: str) -> GitHubRepo:
    """
    Fetch basic repository information.

    Args:
        owner: Repository owner.
        name: Repository name.

    Returns:
        A ``GitHubRepo`` whose default branch, description and star count
        come from the API response, with defaults for missing or null values.
    """
    data = await self._request("GET", f"/repos/{owner}/{name}")

    # The API sends explicit nulls (e.g. "description": null); dict.get()
    # defaults only cover missing keys, so nulls are mapped here as well.
    return GitHubRepo(
        owner=owner,
        name=name,
        default_branch=data.get("default_branch") or "main",
        description=data.get("description") or "",
        stars=data.get("stargazers_count") or 0
    )
|
| 282 |
+
|
| 283 |
+
async def get_repo_tree(
    self,
    repo: GitHubRepo,
    file_filter: Optional[FileFilter] = None
) -> List[GitHubFile]:
    """
    Fetch the repository file tree and filter it.

    Args:
        repo: Repository to list; its default branch is used as the tree ref.
        file_filter: Filter deciding which entries are kept. A default
            FileFilter() is used when omitted.

    Returns:
        The entries accepted by the filter, as GitHubFile objects.
    """
    filter_config = file_filter or FileFilter()

    data = await self._request(
        "GET",
        f"/repos/{repo.owner}/{repo.name}/git/trees/{repo.default_branch}",
        params={"recursive": "1"}
    )

    # Look the list up once; it is used for both iteration and the log line.
    tree = data.get("tree") or []

    # A recursive tree listing may be cut short by the server, which is
    # signalled through the "truncated" flag. Surface it instead of silently
    # working with a partial file list.
    if data.get("truncated"):
        logger.warning(f"⚠️ 仓库 {repo.full_name} 的文件树被截断, 文件列表可能不完整")

    files = []
    for item in tree:
        file = GitHubFile(
            path=item["path"],
            type=item["type"],
            size=item.get("size", 0),
            sha=item.get("sha", "")
        )

        if filter_config.should_include(file):
            files.append(file)

    logger.info(f"📂 仓库 {repo.full_name}: 共 {len(tree)} 项, 过滤后 {len(files)} 文件")
    return files
|
| 320 |
+
|
| 321 |
+
# --------------------------------------------------------
|
| 322 |
+
# 文件内容 API
|
| 323 |
+
# --------------------------------------------------------
|
| 324 |
+
|
| 325 |
+
async def get_file_content(
    self,
    repo: GitHubRepo,
    path: str
) -> Optional[str]:
    """
    Fetch the content of a single file.

    Args:
        repo: Repository the file belongs to; its default branch is the ref.
        path: File path inside the repository.

    Returns:
        The file content decoded as UTF-8. If the path is a directory, a
        plain-text listing of its entries is returned instead. Returns None
        when the file does not exist, is not valid UTF-8, or any other
        error occurs (the error is logged, not raised).
    """
    # Local import: the module-level import block is outside this block.
    from urllib.parse import quote

    try:
        # Percent-encode the path but keep "/" as the segment separator.
        # Without this, characters such as "#", "?" or "%" in a file name
        # are interpreted as URL syntax and the wrong resource is requested.
        encoded_path = quote(path, safe="/")

        data = await self._request(
            "GET",
            f"/repos/{repo.owner}/{repo.name}/contents/{encoded_path}",
            params={"ref": repo.default_branch}
        )

        # A list response means the path is a directory.
        if isinstance(data, list):
            file_names = [f["name"] for f in data]
            return f"Directory '{path}' contains:\n" + "\n".join(
                f"- {name}" for name in file_names
            )

        # Decode the file payload.
        content = data.get("content", "")
        encoding = data.get("encoding", "base64")

        if encoding == "base64":
            return base64.b64decode(content).decode("utf-8")

        return content

    except GitHubNotFoundError:
        logger.warning(f"文件不存在: {path}")
        return None
    except UnicodeDecodeError:
        logger.warning(f"文件无法解码为 UTF-8: {path}")
        return None
    except Exception as e:
        # Deliberate best-effort: callers receive None rather than an exception.
        logger.error(f"获取文件失败 {path}: {e}")
        return None
|
| 372 |
+
|
| 373 |
+
async def get_files_content(
    self,
    repo: GitHubRepo,
    paths: List[str],
    show_progress: bool = False
) -> Dict[str, Optional[str]]:
    """
    Fetch several files concurrently.

    One get_file_content() call is scheduled per path and all of them are
    awaited together via asyncio.gather(return_exceptions=True).

    Args:
        repo: Repository the files belong to.
        paths: File paths to download.
        show_progress: When True, log a start and a summary line.

    Returns:
        A {path: content} dict. The value is None for every path whose
        download failed, was cancelled, or returned None.
    """
    if not paths:
        return {}

    if show_progress:
        # NOTE(review): `_value` is a private attribute of the semaphore —
        # confirm it reflects the figure this log line is meant to show.
        logger.info(f"📥 开始下载 {len(paths)} 个文件 (并发: {self._semaphore._value})")

    # Launch all downloads at once; the semaphore is acquired further down
    # the call chain (see _request).
    results = await asyncio.gather(
        *(self.get_file_content(repo, path) for path in paths),
        return_exceptions=True
    )

    content_map = {}
    success_count = 0

    for path, result in zip(paths, results):
        # Check BaseException, not Exception: asyncio.CancelledError derives
        # from BaseException, and gather() hands it back as a result. With an
        # `Exception` check it would be stored as file content and counted
        # as a success.
        if isinstance(result, BaseException):
            logger.error(f"下载失败 {path}: {result}")
            content_map[path] = None
            continue

        content_map[path] = result
        if result is not None:
            success_count += 1

    if show_progress:
        logger.info(f"✅ 文件下载完成: {success_count}/{len(paths)} 成功")

    return content_map
|
| 421 |
+
|
| 422 |
+
|
| 423 |
+
# ============================================================
|
| 424 |
+
# 全局单例管理
|
| 425 |
+
# ============================================================
|
| 426 |
+
|
| 427 |
+
# Module-level singleton; created lazily by get_github_client().
_github_client: Optional[GitHubClient] = None


def get_github_client() -> GitHubClient:
    """Return the shared GitHubClient, creating it on the first call."""
    global _github_client
    if _github_client is not None:
        return _github_client
    _github_client = GitHubClient()
    return _github_client
|
| 436 |
+
|
| 437 |
+
|
| 438 |
+
async def close_github_client():
    """
    Close the global client (call on application shutdown).

    Does nothing when no client has been created. The global reference is
    cleared even if close() raises, so a later get_github_client() call
    builds a fresh client instead of reusing a half-closed one. The
    exception from close() still propagates to the caller.
    """
    global _github_client
    if not _github_client:
        return

    try:
        await _github_client.close()
    finally:
        _github_client = None
|
| 444 |
+
|
| 445 |
+
|
| 446 |
+
# ============================================================
|
| 447 |
+
# 便捷函数 (兼容旧接口)
|
| 448 |
+
# ============================================================
|
| 449 |
+
|
| 450 |
+
def parse_repo_url(url: str) -> Optional[tuple[str, str]]:
    """
    Parse a GitHub repository reference.

    Accepted forms (an optional ".git" suffix, trailing slash, query string
    or fragment is tolerated on each):
        https://github.com/owner/repo
        http://www.github.com/owner/repo/tree/main
        github.com/owner/repo
        git@github.com:owner/repo
        owner/repo

    Args:
        url: GitHub repository URL or "owner/repo" shorthand.

    Returns:
        An (owner, repo) tuple, or None when the input is empty or does not
        contain a non-empty owner and repository name.
    """
    if not url:
        return None

    cleaned = url.strip()

    # Drop query string and fragment; they are never part of owner/repo.
    for separator in ("?", "#"):
        cleaned = cleaned.split(separator, 1)[0]

    # Strip trailing slashes BEFORE the ".git" check so "…/repo.git/" works
    # and "github.com/owner/" does not yield an empty repository name.
    cleaned = cleaned.rstrip("/")
    if cleaned.endswith(".git"):
        cleaned = cleaned[:-4]

    # Rewrite the scp-like SSH form "git@github.com:owner/repo".
    ssh_prefix = "git@github.com:"
    if cleaned.startswith(ssh_prefix):
        cleaned = "github.com/" + cleaned[len(ssh_prefix):]

    parts = cleaned.replace("https://", "").replace("http://", "").split("/")

    for host in ("github.com", "www.github.com"):
        if host in parts:
            idx = parts.index(host)
            owner_and_repo = parts[idx + 1:idx + 3]
            if len(owner_and_repo) == 2 and all(owner_and_repo):
                return (owner_and_repo[0], owner_and_repo[1])
            # Host present but owner/repo missing: not a repository URL.
            return None

    # Bare "owner/repo" shorthand.
    if len(parts) == 2 and all(parts):
        return (parts[0], parts[1])

    return None
|
app/utils/llm_client.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 文件路径: app/utils/llm_client.py
|
| 2 |
+
"""
|
| 3 |
+
统一 LLM 客户端入口
|
| 4 |
+
|
| 5 |
+
支持多个 LLM 供应商,通过 LLM_PROVIDER 环境变量切换:
|
| 6 |
+
- openai: OpenAI (GPT-4, GPT-4o 等)
|
| 7 |
+
- deepseek: DeepSeek (deepseek-chat, deepseek-coder 等)
|
| 8 |
+
- anthropic: Anthropic (Claude 3.5, Claude 3 等)
|
| 9 |
+
- gemini: Google Gemini (gemini-1.5-pro 等)
|
| 10 |
+
|
| 11 |
+
使用方式 (与原来完全兼容):
|
| 12 |
+
from app.utils.llm_client import client
|
| 13 |
+
|
| 14 |
+
response = await client.chat.completions.create(
|
| 15 |
+
model=settings.default_model_name,
|
| 16 |
+
messages=[{"role": "user", "content": "Hello"}],
|
| 17 |
+
stream=True
|
| 18 |
+
)
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
from app.core.config import settings
|
| 22 |
+
from app.utils.llm_providers import LLMFactory, BaseLLMProvider
|
| 23 |
+
from typing import Optional
|
| 24 |
+
|
| 25 |
+
# 全局客户端实例
|
| 26 |
+
client: Optional[BaseLLMProvider] = None
|
| 27 |
+
|
| 28 |
+
def _initialize_client() -> Optional[BaseLLMProvider]:
    """
    Build the LLM client described by the application settings.

    The provider name comes from settings.LLM_PROVIDER (lower-cased); key,
    base URL and model name are read from the matching settings properties.

    Returns:
        The provider instance created by LLMFactory.create, or None when no
        API key is configured or the factory raises (a message is printed
        in both cases).
    """
    provider = settings.LLM_PROVIDER.lower()
    connection = {
        "api_key": settings.current_api_key,
        "base_url": settings.current_base_url,
        "model_name": settings.default_model_name,
    }

    # Without a key there is nothing to initialise.
    if not connection["api_key"]:
        print(f"❌ 未找到 {provider.upper()}_API_KEY")
        return None

    try:
        instance = LLMFactory.create(
            provider=provider,
            api_key=connection["api_key"],
            model_name=connection["model_name"],
            base_url=connection["base_url"],
            temperature=settings.LLM_TEMPERATURE,
            max_tokens=settings.LLM_MAX_TOKENS,
            timeout=settings.LLM_TIMEOUT,
        )
    except Exception as e:
        print(f"❌ LLM Client 初始化失败: {e}")
        return None
    return instance
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def get_client() -> Optional[BaseLLMProvider]:
    """
    Return the module-level LLM client.

    If the global `client` is still None, one initialisation attempt is
    made first; the result (possibly None again) is stored and returned.
    """
    global client
    if client is not None:
        return client
    client = _initialize_client()
    return client
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def reinitialize_client(
    provider: str = None,
    api_key: str = None,
    model_name: str = None,
    base_url: str = None,
) -> Optional[BaseLLMProvider]:
    """
    Re-create the global client, e.g. to switch provider or model at runtime.

    Every argument is optional; an omitted (or empty) argument falls back
    to the corresponding value from settings.

    Args:
        provider: New provider name (optional).
        api_key: New API key (optional).
        model_name: New model name (optional).
        base_url: New base URL (optional).

    Returns:
        The new client on success. On failure a message is printed, None is
        returned and the previous global `client` is left untouched.
    """
    global client

    # Lower-case the provider exactly as _initialize_client() does, so both
    # entry points hand the factory the same spelling.
    _provider = (provider or settings.LLM_PROVIDER).lower()
    # NOTE(review): when `provider` differs from settings.LLM_PROVIDER these
    # fallbacks presumably still describe the configured provider — callers
    # switching provider should pass api_key / base_url explicitly. Confirm
    # against the settings implementation.
    _api_key = api_key or settings.current_api_key
    _model_name = model_name or settings.default_model_name
    _base_url = base_url or settings.current_base_url

    try:
        client = LLMFactory.create(
            provider=_provider,
            api_key=_api_key,
            model_name=_model_name,
            base_url=_base_url,
            # Carry over the tuning values that the initial client receives;
            # previously a re-initialised client did not get them.
            temperature=settings.LLM_TEMPERATURE,
            max_tokens=settings.LLM_MAX_TOKENS,
            timeout=settings.LLM_TIMEOUT,
        )
        return client
    except Exception as e:
        print(f"❌ 重新初始化失败: {e}")
        return None
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
# 自动初始化客户端
|
| 108 |
+
client = _initialize_client()
|
app/utils/llm_providers/__init__.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 文件路径: app/utils/llm_providers/__init__.py
|
| 2 |
+
"""
|
| 3 |
+
多 LLM 供应商支持模块
|
| 4 |
+
|
| 5 |
+
支持的供应商:
|
| 6 |
+
- OpenAI (GPT-4, GPT-4o, GPT-3.5-turbo 等)
|
| 7 |
+
- DeepSeek (deepseek-chat, deepseek-coder 等)
|
| 8 |
+
- Anthropic (Claude 3.5, Claude 3 等)
|
| 9 |
+
- Google Gemini (gemini-pro, gemini-1.5-pro 等)
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from .base import BaseLLMProvider, LLMResponse, LLMConfig
|
| 13 |
+
from .openai_provider import OpenAIProvider
|
| 14 |
+
from .deepseek_provider import DeepSeekProvider
|
| 15 |
+
from .anthropic_provider import AnthropicProvider
|
| 16 |
+
from .gemini_provider import GeminiProvider
|
| 17 |
+
from .factory import LLMFactory, get_llm_client
|
| 18 |
+
|
| 19 |
+
__all__ = [
|
| 20 |
+
"BaseLLMProvider",
|
| 21 |
+
"LLMResponse",
|
| 22 |
+
"LLMConfig",
|
| 23 |
+
"OpenAIProvider",
|
| 24 |
+
"DeepSeekProvider",
|
| 25 |
+
"AnthropicProvider",
|
| 26 |
+
"GeminiProvider",
|
| 27 |
+
"LLMFactory",
|
| 28 |
+
"get_llm_client",
|
| 29 |
+
]
|
app/utils/llm_providers/anthropic_provider.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 文件路径: app/utils/llm_providers/anthropic_provider.py
|
| 2 |
+
"""
|
| 3 |
+
Anthropic (Claude) LLM 提供商实现
|
| 4 |
+
|
| 5 |
+
支持模型: claude-3-5-sonnet, claude-3-opus, claude-3-haiku 等
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import uuid
|
| 9 |
+
import time
|
| 10 |
+
from typing import List, AsyncIterator
|
| 11 |
+
|
| 12 |
+
from .base import (
|
| 13 |
+
BaseLLMProvider,
|
| 14 |
+
LLMConfig,
|
| 15 |
+
LLMMessage,
|
| 16 |
+
LLMResponse,
|
| 17 |
+
LLMChoice,
|
| 18 |
+
LLMUsage,
|
| 19 |
+
LLMProviderType
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class AnthropicProvider(BaseLLMProvider):
    """
    Anthropic (Claude) API provider.

    The Anthropic message format differs slightly from OpenAI's:
    - the system prompt is passed as a separate `system` parameter
    - `messages` only contains the user/assistant turns
    """

    def __init__(self, config: LLMConfig):
        """
        Create the async Anthropic client.

        The `anthropic` package is imported lazily; when it is missing the
        provider is still constructed but marked unavailable, and both
        request methods raise RuntimeError.
        """
        super().__init__(config)
        try:
            from anthropic import AsyncAnthropic
            # NOTE(review): config.base_url is not forwarded here — confirm
            # whether custom endpoints should be supported.
            self._client = AsyncAnthropic(
                api_key=config.api_key,
                timeout=config.timeout
            )
            self._available = True
        except ImportError:
            print("⚠️ anthropic 包未安装,请运行: pip install anthropic")
            self._client = None
            self._available = False

    def _extract_system_message(self, messages: List[LLMMessage]) -> tuple:
        """
        Split the system prompt from the conversation turns.

        All non-empty system messages are joined with a blank line, in
        their original order, so none of them is lost when a conversation
        carries more than one.

        Returns:
            (system_prompt, filtered_messages) — system_prompt is None when
            there is no non-empty system message.
        """
        system_parts = []
        filtered_messages = []

        for msg in messages:
            if msg.role == "system":
                if msg.content:
                    system_parts.append(msg.content)
            else:
                filtered_messages.append(msg)

        system_prompt = "\n\n".join(system_parts) if system_parts else None
        return system_prompt, filtered_messages

    def _build_request_params(
        self,
        messages: List[LLMMessage],
        model: str,
        temperature: float,
        max_tokens: int,
        timeout: int,
    ) -> dict:
        """Build the keyword arguments shared by the streaming and non-streaming calls."""
        system_prompt, filtered_messages = self._extract_system_message(messages)

        request_params = {
            "model": model,
            "messages": [
                {"role": m.role, "content": m.content}
                for m in filtered_messages
            ],
            "temperature": temperature,
            "max_tokens": max_tokens,
        }

        if system_prompt:
            request_params["system"] = system_prompt

        # Forward the per-call timeout; it was previously accepted but ignored.
        if timeout is not None:
            request_params["timeout"] = timeout

        return request_params

    async def chat_completions_create(
        self,
        messages: List[LLMMessage],
        model: str,
        temperature: float,
        max_tokens: int,
        timeout: int,
        **kwargs
    ) -> LLMResponse:
        """
        Non-streaming request.

        Returns:
            An LLMResponse with a single choice holding the concatenated
            text blocks of the reply, plus token usage.

        Raises:
            RuntimeError: when the `anthropic` package is not installed.
        """
        if not self._available:
            raise RuntimeError("Anthropic client not available. Please install: pip install anthropic")

        request_params = self._build_request_params(
            messages, model, temperature, max_tokens, timeout
        )

        response = await self._client.messages.create(**request_params)

        # The reply content is a list of blocks; keep those that carry text.
        content = "".join(
            block.text
            for block in (response.content or [])
            if hasattr(block, 'text')
        )

        choices = [
            LLMChoice(
                index=0,
                message=LLMMessage(role="assistant", content=content),
                finish_reason=response.stop_reason
            )
        ]

        usage = LLMUsage(
            prompt_tokens=response.usage.input_tokens,
            completion_tokens=response.usage.output_tokens,
            total_tokens=response.usage.input_tokens + response.usage.output_tokens
        )

        return LLMResponse(
            id=response.id,
            model=response.model,
            choices=choices,
            usage=usage,
            created=int(time.time())
        )

    async def chat_completions_create_stream(
        self,
        messages: List[LLMMessage],
        model: str,
        temperature: float,
        max_tokens: int,
        timeout: int,
        **kwargs
    ) -> AsyncIterator[LLMResponse]:
        """
        Streaming request.

        Yields:
            One LLMResponse per text fragment, each with a single choice
            whose `delta` carries the fragment. All chunks of one call
            share a locally generated id.

        Raises:
            RuntimeError: when the `anthropic` package is not installed.
        """
        if not self._available:
            raise RuntimeError("Anthropic client not available. Please install: pip install anthropic")

        request_params = self._build_request_params(
            messages, model, temperature, max_tokens, timeout
        )

        response_id = f"msg_{uuid.uuid4().hex[:24]}"

        async with self._client.messages.stream(**request_params) as stream:
            async for text in stream.text_stream:
                choices = [
                    LLMChoice(
                        index=0,
                        delta=LLMMessage(role="assistant", content=text),
                        finish_reason=None
                    )
                ]
                yield LLMResponse(
                    id=response_id,
                    model=model,
                    choices=choices,
                    created=int(time.time())
                )

    def validate_connection(self) -> bool:
        """Return True when the SDK is importable and an API key is configured (no network call)."""
        return self._available and bool(self.config.api_key)
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def create_anthropic_provider(
    api_key: str,
    model_name: str = "claude-3-5-sonnet-20241022",
    **kwargs
) -> AnthropicProvider:
    """
    Factory helper: build an AnthropicProvider.

    Extra keyword arguments are forwarded to LLMConfig unchanged.
    """
    return AnthropicProvider(
        LLMConfig(
            provider=LLMProviderType.ANTHROPIC,
            api_key=api_key,
            model_name=model_name,
            **kwargs
        )
    )
|
app/utils/llm_providers/base.py
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 文件路径: app/utils/llm_providers/base.py
|
| 2 |
+
"""
|
| 3 |
+
LLM 提供商基类定义
|
| 4 |
+
|
| 5 |
+
定义统一的接口规范,所有供应商实现都必须遵循此规范。
|
| 6 |
+
采用适配器模式,将不同供应商的 API 统一为 OpenAI 兼容格式。
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import logging
|
| 10 |
+
from abc import ABC, abstractmethod
|
| 11 |
+
from dataclasses import dataclass, field
|
| 12 |
+
from typing import List, Dict, Any, Optional, AsyncIterator, Union
|
| 13 |
+
from enum import Enum
|
| 14 |
+
|
| 15 |
+
from app.utils.retry import llm_retry, is_retryable_error
|
| 16 |
+
|
| 17 |
+
# 配置日志
|
| 18 |
+
logger = logging.getLogger("llm_provider")
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class LLMProviderType(str, Enum):
    """
    Supported LLM vendors.

    Members also subclass str, so each compares equal to its lower-case
    string value.
    """

    OPENAI = "openai"
    DEEPSEEK = "deepseek"
    ANTHROPIC = "anthropic"
    GEMINI = "gemini"
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
@dataclass
class LLMConfig:
    """Connection and generation settings for one LLM provider instance."""

    # Required fields.
    provider: LLMProviderType
    api_key: str
    model_name: str

    # Optional endpoint override; None leaves the choice to the provider.
    base_url: Optional[str] = None

    # Defaults used when a request does not override them.
    temperature: float = 0.1
    max_tokens: int = 4096
    timeout: int = 600

    # Free-form extras; each instance gets its own dict.
    extra_params: Dict[str, Any] = field(default_factory=dict)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
@dataclass
class LLMMessage:
    """A single chat message (OpenAI-compatible shape)."""

    # One of "system", "user", "assistant".
    role: str
    # Message text.
    content: str
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
@dataclass
class LLMUsage:
    """Token accounting for one request; every counter defaults to 0."""

    prompt_tokens: int = 0
    completion_tokens: int = 0
    total_tokens: int = 0
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
@dataclass
class LLMChoice:
    """One candidate answer inside a response (OpenAI-compatible shape)."""

    index: int
    # Complete message; filled in for non-streaming responses.
    message: Optional[LLMMessage] = None
    # Incremental fragment; used for streaming responses.
    delta: Optional[LLMMessage] = None
    finish_reason: Optional[str] = None
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
@dataclass
class LLMResponse:
    """
    Unified LLM response.

    Shaped after OpenAI's ChatCompletion object so existing call sites can
    consume it without large changes.
    """

    id: str
    model: str
    choices: List[LLMChoice]
    usage: Optional[LLMUsage] = None
    created: int = 0

    @property
    def content(self) -> str:
        """
        Convenience accessor: text of the first choice's message.

        Returns:
            The message content, or "" when there are no choices, the first
            choice has no `message` (e.g. a streaming chunk that only sets
            `delta`), or the message content is None.
        """
        if not self.choices:
            return ""

        first_message = self.choices[0].message
        if first_message is None:
            return ""

        # Providers may hand through a None content; honour the `str`
        # return annotation instead of leaking None to callers.
        return first_message.content or ""
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
# 辅助类定义(在 BaseLLMProvider 外部,避免嵌套类问题)
|
| 89 |
+
class _CompletionsNamespace:
    """Emulates the `client.chat.completions` namespace of the OpenAI SDK."""

    def __init__(self, provider: 'BaseLLMProvider'):
        self._provider = provider

    async def create(
        self,
        model: str = None,
        messages: List[Dict[str, str]] = None,
        temperature: float = None,
        max_tokens: int = None,
        stream: bool = False,
        timeout: int = None,
        **kwargs
    ) -> Union[LLMResponse, AsyncIterator[LLMResponse]]:
        """
        Unified completions.create entry point.

        Compatible with the OpenAI SDK call style:
            response = await client.chat.completions.create(
                model="gpt-4",
                messages=[{"role": "user", "content": "Hello"}],
                stream=True
            )

        Omitted arguments fall back to the provider's config. Messages are
        converted from dicts to LLMMessage objects.

        Returns:
            stream=False: the LLMResponse of a retried non-streaming call.
            stream=True: an async iterator of LLMResponse chunks; the
            request itself starts when iteration begins.
        """
        # Merge explicit arguments with the provider configuration.
        _model = model or self._provider.config.model_name
        # 0.0 is a valid temperature, so test against None rather than truthiness.
        _temperature = temperature if temperature is not None else self._provider.config.temperature
        _max_tokens = max_tokens or self._provider.config.max_tokens
        _timeout = timeout or self._provider.config.timeout

        # Convert the message dicts.
        _messages = [
            LLMMessage(role=m["role"], content=m["content"])
            for m in (messages or [])
        ]

        if stream:
            return self._create_stream_with_retry(
                messages=_messages,
                model=_model,
                temperature=_temperature,
                max_tokens=_max_tokens,
                timeout=_timeout,
                **kwargs
            )
        else:
            return await self._create_with_retry(
                messages=_messages,
                model=_model,
                temperature=_temperature,
                max_tokens=_max_tokens,
                timeout=_timeout,
                **kwargs
            )

    @llm_retry
    async def _create_with_retry(
        self,
        messages: List[LLMMessage],
        model: str,
        temperature: float,
        max_tokens: int,
        timeout: int,
        **kwargs
    ) -> LLMResponse:
        """Non-streaming request, wrapped by the llm_retry decorator."""
        logger.debug(f"🔄 LLM 请求: model={model}, messages_count={len(messages)}")
        return await self._provider.chat_completions_create(
            messages=messages,
            model=model,
            temperature=temperature,
            max_tokens=max_tokens,
            timeout=timeout,
            **kwargs
        )

    async def _create_stream_with_retry(
        self,
        messages: List[LLMMessage],
        model: str,
        temperature: float,
        max_tokens: int,
        timeout: int,
        max_retries: int = 3,
        **kwargs
    ) -> AsyncIterator[LLMResponse]:
        """
        Streaming request with retry.

        A failed attempt is retried only while nothing has been yielded to
        the consumer yet. Once a chunk has been delivered, restarting the
        stream would replay the beginning of the answer and the consumer
        would see duplicated text, so the error is raised instead.

        Args:
            max_retries: Maximum number of attempts; values below 1 are
                treated as 1 so at least one request is made.

        Yields:
            The chunks produced by the provider's stream.

        Raises:
            The last provider exception when it is not retryable, when
            attempts are exhausted, or when it occurs mid-stream.
        """
        # Local import: the module-level import block is outside this block.
        import asyncio

        attempts = max(1, max_retries)

        for attempt in range(1, attempts + 1):
            emitted = False
            try:
                logger.debug(f"🔄 LLM 流式请求 (尝试 {attempt}/{attempts}): model={model}")

                stream = self._provider.chat_completions_create_stream(
                    messages=messages,
                    model=model,
                    temperature=temperature,
                    max_tokens=max_tokens,
                    timeout=timeout,
                    **kwargs
                )

                async for chunk in stream:
                    emitted = True
                    yield chunk

                # Stream finished normally.
                return

            except Exception as e:
                can_retry = (
                    not emitted
                    and attempt < attempts
                    and is_retryable_error(e)
                )
                if not can_retry:
                    # Not retryable, attempts exhausted, or output already delivered.
                    logger.error(f"❌ LLM 流式请求最终失败: {type(e).__name__}: {e}")
                    raise

                wait_time = min(2 ** attempt, 30)  # exponential backoff, capped at 30s
                logger.warning(
                    f"🔄 LLM 流式请求失败 (尝试 {attempt}/{attempts}): "
                    f"{type(e).__name__}: {e}. 等待 {wait_time}s 后重试..."
                )
                await asyncio.sleep(wait_time)
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
class _ChatNamespace:
    """Emulates the `client.chat` namespace; exposes `completions`."""

    def __init__(self, provider: 'BaseLLMProvider'):
        completions_api = _CompletionsNamespace(provider)
        self._provider = provider
        self.completions = completions_api
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
class BaseLLMProvider(ABC):
    """
    Abstract base class for LLM providers.

    Every provider implementation subclasses this and implements:
    - chat_completions_create: non-streaming request
    - chat_completions_create_stream: streaming request (async generator)
    - validate_connection

    For compatibility with existing code, instances expose a
    `chat.completions` attribute chain that mimics the OpenAI client.
    """

    def __init__(self, config: LLMConfig):
        self.config = config
        self._client = None
        # Mimic the attribute structure of the OpenAI SDK.
        self.chat = _ChatNamespace(self)

    @abstractmethod
    async def chat_completions_create(
        self,
        messages: List[LLMMessage],
        model: str,
        temperature: float,
        max_tokens: int,
        timeout: int,
        **kwargs
    ) -> LLMResponse:
        """
        Non-streaming chat completion request.

        Args:
            messages: Message list.
            model: Model name.
            temperature: Sampling temperature.
            max_tokens: Maximum number of tokens.
            timeout: Timeout.

        Returns:
            LLMResponse: response in the unified format.
        """
        raise NotImplementedError

    @abstractmethod
    async def chat_completions_create_stream(
        self,
        messages: List[LLMMessage],
        model: str,
        temperature: float,
        max_tokens: int,
        timeout: int,
        **kwargs
    ) -> AsyncIterator[LLMResponse]:
        """
        Streaming chat completion request.

        Implementations must be async generators: the caller iterates the
        return value directly with `async for`, without awaiting it first.

        Args:
            messages: Message list.
            model: Model name.
            temperature: Sampling temperature.
            max_tokens: Maximum number of tokens.
            timeout: Timeout.

        Yields:
            LLMResponse: streamed response chunks.
        """
        raise NotImplementedError
        # Unreachable, but the presence of `yield` makes this stub an async
        # generator, matching the shape implementations must have.
        yield  # pragma: no cover

    @abstractmethod
    def validate_connection(self) -> bool:
        """Check whether the provider is usable."""
        raise NotImplementedError

    @property
    def provider_name(self) -> str:
        """Provider name; accepts both an enum member and a plain string in config.provider."""
        provider = self.config.provider
        return getattr(provider, "value", provider)

    @property
    def model_name(self) -> str:
        """Name of the currently configured model."""
        return self.config.model_name
|
app/utils/llm_providers/deepseek_provider.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 文件路径: app/utils/llm_providers/deepseek_provider.py
|
| 2 |
+
"""
|
| 3 |
+
DeepSeek LLM 提供商实现
|
| 4 |
+
|
| 5 |
+
DeepSeek API 兼容 OpenAI 协议,因此直接复用 OpenAI SDK。
|
| 6 |
+
支持模型: deepseek-chat, deepseek-coder, deepseek-reasoner 等
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from typing import List, AsyncIterator
|
| 10 |
+
from openai import AsyncOpenAI
|
| 11 |
+
|
| 12 |
+
from .base import (
|
| 13 |
+
BaseLLMProvider,
|
| 14 |
+
LLMConfig,
|
| 15 |
+
LLMMessage,
|
| 16 |
+
LLMResponse,
|
| 17 |
+
LLMChoice,
|
| 18 |
+
LLMUsage,
|
| 19 |
+
LLMProviderType
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# DeepSeek 默认 API 端点
|
| 24 |
+
DEEPSEEK_DEFAULT_BASE_URL = "https://api.deepseek.com"
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class DeepSeekProvider(BaseLLMProvider):
    """
    Provider for the DeepSeek chat API.

    DeepSeek speaks an OpenAI-compatible protocol, so every request goes
    through the OpenAI SDK's ``AsyncOpenAI`` client pointed at the DeepSeek
    endpoint.
    """

    def __init__(self, config: LLMConfig):
        """Build the async client, defaulting to the public DeepSeek endpoint."""
        super().__init__(config)
        # An unset/empty base_url falls back to the module-level default.
        endpoint = config.base_url if config.base_url else DEEPSEEK_DEFAULT_BASE_URL
        self._client = AsyncOpenAI(
            api_key=config.api_key,
            base_url=endpoint,
            timeout=config.timeout,
        )

    async def chat_completions_create(
        self,
        messages: List[LLMMessage],
        model: str,
        temperature: float,
        max_tokens: int,
        timeout: int,
        **kwargs
    ) -> LLMResponse:
        """
        Send a non-streaming chat completion request.

        Args:
            messages: Conversation so far; only ``role`` and ``content`` are sent.
            model: Model identifier forwarded to the API.
            temperature: Sampling temperature forwarded to the API.
            max_tokens: Completion token limit forwarded to the API.
            timeout: Per-request timeout forwarded to the SDK call.
            **kwargs: Extra keyword arguments passed through to the SDK call.

        Returns:
            The SDK response converted into the unified ``LLMResponse``.
        """
        payload = []
        for item in messages:
            payload.append({"role": item.role, "content": item.content})

        raw = await self._client.chat.completions.create(
            model=model,
            messages=payload,
            temperature=temperature,
            max_tokens=max_tokens,
            timeout=timeout,
            **kwargs
        )

        unified_choices = []
        for choice in raw.choices:
            reply = LLMMessage(role=choice.message.role, content=choice.message.content)
            unified_choices.append(
                LLMChoice(
                    index=choice.index,
                    message=reply,
                    finish_reason=choice.finish_reason,
                )
            )

        # Usage is optional on the SDK response; leave it as None when absent.
        token_usage = None
        if raw.usage:
            token_usage = LLMUsage(
                prompt_tokens=raw.usage.prompt_tokens,
                completion_tokens=raw.usage.completion_tokens,
                total_tokens=raw.usage.total_tokens,
            )

        return LLMResponse(
            id=raw.id,
            model=raw.model,
            choices=unified_choices,
            usage=token_usage,
            created=raw.created,
        )

    async def chat_completions_create_stream(
        self,
        messages: List[LLMMessage],
        model: str,
        temperature: float,
        max_tokens: int,
        timeout: int,
        **kwargs
    ) -> AsyncIterator[LLMResponse]:
        """
        Send a streaming chat completion request.

        Yields one ``LLMResponse`` per SDK chunk that carries choices; each
        holds a single choice (index 0) built from the chunk's first choice,
        with a missing delta content replaced by an empty string.
        """
        payload = []
        for item in messages:
            payload.append({"role": item.role, "content": item.content})

        stream = await self._client.chat.completions.create(
            model=model,
            messages=payload,
            temperature=temperature,
            max_tokens=max_tokens,
            timeout=timeout,
            stream=True,
            **kwargs
        )

        async for piece in stream:
            # Chunks without choices carry nothing to forward.
            if not piece.choices:
                continue
            text = piece.choices[0].delta.content or ""
            partial = LLMChoice(
                index=0,
                delta=LLMMessage(role="assistant", content=text),
                finish_reason=piece.choices[0].finish_reason,
            )
            yield LLMResponse(
                id=piece.id,
                model=piece.model,
                choices=[partial],
                created=piece.created,
            )

    def validate_connection(self) -> bool:
        """Report whether an API key is configured (no network call is made)."""
        key = self.config.api_key
        return bool(key)
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def create_deepseek_provider(
    api_key: str,
    model_name: str = "deepseek-chat",
    base_url: str = None,
    **kwargs
) -> DeepSeekProvider:
    """
    Factory helper: build a ``DeepSeekProvider`` from plain arguments.

    Args:
        api_key: DeepSeek API key.
        model_name: Model to use.
        base_url: Custom endpoint; the default DeepSeek URL is used when
            this is None or empty.
        **kwargs: Extra fields forwarded to ``LLMConfig``.
    """
    endpoint = base_url if base_url else DEEPSEEK_DEFAULT_BASE_URL
    return DeepSeekProvider(
        LLMConfig(
            provider=LLMProviderType.DEEPSEEK,
            api_key=api_key,
            model_name=model_name,
            base_url=endpoint,
            **kwargs
        )
    )
|
app/utils/llm_providers/factory.py
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 文件路径: app/utils/llm_providers/factory.py
|
| 2 |
+
"""
|
| 3 |
+
LLM 工厂模块
|
| 4 |
+
|
| 5 |
+
提供统一的 LLM 客户端创建接口,根据配置自动选择合适的供应商。
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
from typing import Optional
|
| 10 |
+
|
| 11 |
+
from .base import BaseLLMProvider, LLMConfig, LLMProviderType
|
| 12 |
+
from .openai_provider import OpenAIProvider
|
| 13 |
+
from .deepseek_provider import DeepSeekProvider, DEEPSEEK_DEFAULT_BASE_URL
|
| 14 |
+
from .anthropic_provider import AnthropicProvider
|
| 15 |
+
from .gemini_provider import GeminiProvider
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class LLMFactory:
    """
    Factory for LLM provider clients.

    Resolves a provider name to its provider class, fills in per-provider
    defaults (model name, base URL) and can take its inputs from
    environment variables.
    """

    # Provider enum member -> provider class.
    _providers = {
        LLMProviderType.OPENAI: OpenAIProvider,
        LLMProviderType.DEEPSEEK: DeepSeekProvider,
        LLMProviderType.ANTHROPIC: AnthropicProvider,
        LLMProviderType.GEMINI: GeminiProvider,
    }

    # Model name used when the caller does not supply one.
    _default_models = {
        LLMProviderType.OPENAI: "gpt-4o-mini",
        LLMProviderType.DEEPSEEK: "deepseek-chat",
        LLMProviderType.ANTHROPIC: "claude-3-5-sonnet-20241022",
        LLMProviderType.GEMINI: "gemini-1.5-flash",
    }

    # Base URL used when the caller does not supply one.
    _default_base_urls = {
        LLMProviderType.OPENAI: None,  # use the SDK default
        LLMProviderType.DEEPSEEK: DEEPSEEK_DEFAULT_BASE_URL,
        LLMProviderType.ANTHROPIC: None,
        LLMProviderType.GEMINI: None,
    }

    @classmethod
    def create(
        cls,
        provider: str,
        api_key: str,
        model_name: str = None,
        base_url: str = None,
        **kwargs
    ) -> Optional[BaseLLMProvider]:
        """
        Create an LLM client.

        Args:
            provider: Provider name ("openai", "deepseek", "anthropic",
                "gemini"); case and surrounding whitespace are ignored.
            api_key: API key.
            model_name: Model name (optional; the per-provider default is used).
            base_url: Custom API endpoint (optional).
            **kwargs: Extra fields forwarded to ``LLMConfig``.

        Returns:
            A ``BaseLLMProvider`` instance, or None when creation fails for
            any reason (unknown provider, missing key, invalid config,
            provider constructor error, failed validation).
        """
        try:
            # A non-string provider (e.g. None) has no .strip() and raises
            # AttributeError; report it like an unknown name instead of
            # crashing, since this method's contract is "None on failure".
            provider_type = LLMProviderType(provider.strip().lower())
        except (AttributeError, ValueError):
            print(f"❌ 不支持的 LLM 提供商: {provider}")
            print(f" 支持的提供商: {', '.join([p.value for p in LLMProviderType])}")
            return None

        if not api_key:
            print(f"❌ 未提供 {provider} 的 API Key")
            return None

        provider_class = cls._providers.get(provider_type)
        if not provider_class:
            print(f"❌ 提供商 {provider} 未实现")
            return None

        # Normalized name for log lines (no stray whitespace from the input).
        label = provider_type.value.upper()

        try:
            # Config construction is inside the try so that invalid **kwargs
            # yield None like every other failure rather than an exception.
            config = LLMConfig(
                provider=provider_type,
                api_key=api_key,
                model_name=model_name or cls._default_models.get(provider_type, "default"),
                base_url=base_url or cls._default_base_urls.get(provider_type),
                **kwargs
            )
            client = provider_class(config)
            if client.validate_connection():
                print(f"✅ {label} Client 初始化成功 (Model: {config.model_name})")
                return client
            print(f"❌ {label} Client 验证失败")
            return None
        except Exception as e:
            print(f"❌ {label} Client 初始化失败: {e}")
            return None

    @classmethod
    def create_from_env(cls, provider: str = None) -> Optional[BaseLLMProvider]:
        """
        Create an LLM client from environment variables.

        Environment variables:
            - LLM_PROVIDER: provider name (overridden by the ``provider``
              argument; "deepseek" when unset or empty)
            - {PROVIDER}_API_KEY: API key (e.g. OPENAI_API_KEY, DEEPSEEK_API_KEY);
              {PROVIDER}API_KEY is accepted as a fallback spelling
            - {PROVIDER}_BASE_URL: custom endpoint (optional)
            - MODEL_NAME: model name (optional)

        Args:
            provider: Provider name (optional; read from LLM_PROVIDER otherwise).

        Returns:
            A ``BaseLLMProvider`` instance, or None when no API key is found
            or ``create`` fails.
        """
        # os.getenv's default only applies when the variable is unset, so an
        # empty "LLM_PROVIDER=" line must be handled with `or` as well.
        _provider = (provider or os.getenv("LLM_PROVIDER") or "deepseek").strip().lower()
        prefix = _provider.upper()

        # Accepted key names, in lookup order.
        key_env_names = [
            f"{prefix}_API_KEY",
            f"{prefix}API_KEY",
        ]

        # First non-empty value wins.
        api_key = next((value for value in map(os.getenv, key_env_names) if value), None)

        if not api_key:
            print(f"❌ 未找到 {prefix} API Key")
            print(f" 请设置环境变量: {key_env_names[0]}")
            return None

        return cls.create(
            provider=_provider,
            api_key=api_key,
            model_name=os.getenv("MODEL_NAME"),
            base_url=os.getenv(f"{prefix}_BASE_URL"),
        )
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
def get_llm_client(provider: str = None) -> Optional[BaseLLMProvider]:
    """
    Convenience wrapper around ``LLMFactory.create_from_env``.

    Args:
        provider: Provider name (optional).

    Returns:
        Whatever ``LLMFactory.create_from_env`` returns for ``provider``.
    """
    client = LLMFactory.create_from_env(provider)
    return client
|
app/utils/llm_providers/gemini_provider.py
ADDED
|
@@ -0,0 +1,301 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 文件路径: app/utils/llm_providers/gemini_provider.py
|
| 2 |
+
"""
|
| 3 |
+
Google Gemini LLM 提供商实现
|
| 4 |
+
|
| 5 |
+
支持模型: gemini-1.5-pro, gemini-1.5-flash, gemini-pro 等
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import uuid
|
| 9 |
+
import time
|
| 10 |
+
from typing import List, AsyncIterator
|
| 11 |
+
|
| 12 |
+
from .base import (
|
| 13 |
+
BaseLLMProvider,
|
| 14 |
+
LLMConfig,
|
| 15 |
+
LLMMessage,
|
| 16 |
+
LLMResponse,
|
| 17 |
+
LLMChoice,
|
| 18 |
+
LLMUsage,
|
| 19 |
+
LLMProviderType
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class GeminiProvider(BaseLLMProvider):
    """
    Provider for Google Gemini.

    Two transport modes, chosen once at construction time:
    1. Native mode (``config.base_url`` is None): the ``google-generativeai`` SDK.
    2. OpenAI-compatible mode (``config.base_url`` is set): the OpenAI SDK's
       ``AsyncOpenAI`` client against that endpoint.
    """

    def __init__(self, config: LLMConfig):
        """Set up the client for the selected mode; a missing SDK leaves the provider unavailable."""
        super().__init__(config)
        self._available = False
        self._use_openai_compat = config.base_url is not None

        if self._use_openai_compat:
            # OpenAI-compatible mode.
            try:
                from openai import AsyncOpenAI
                self._client = AsyncOpenAI(
                    api_key=config.api_key,
                    base_url=config.base_url,
                    timeout=config.timeout
                )
                self._available = True
                print(f"✅ Gemini Provider (OpenAI Compatible) initialized")
            except ImportError:
                print("⚠️ openai 包未安装")
        else:
            # Native Google AI SDK mode.
            try:
                import google.generativeai as genai
                genai.configure(api_key=config.api_key)
                self._genai = genai
                self._model = genai.GenerativeModel(config.model_name)
                self._available = True
                print(f"✅ Gemini Provider (Native SDK) initialized")
            except ImportError:
                print("⚠️ google-generativeai 包未安装,请运行: pip install google-generativeai")
                self._genai = None
                self._model = None

    def _convert_messages_to_gemini(self, messages: List[LLMMessage]) -> tuple:
        """
        Convert unified messages into the structure used by the native SDK path.

        Conversion rules:
        - "system" messages are not emitted as their own entries. All of them
          are joined (in order, separated by a blank line) and prepended to
          the first user message.
        - "assistant" becomes role "model"; every other role becomes "user".

        Returns:
            ``(history, current_message)``: every converted entry except the
            last, and the text of the last entry. With no user/assistant
            messages the joined system text (or "" when there is none) is
            returned as ``current_message`` with an empty history.
        """
        # Gather every system message up front so that none is lost to
        # ordering (system after the first turn) or overwritten by a later one.
        system_parts = [m.content for m in messages if m.role == "system"]
        system_text = "\n\n".join(system_parts)
        system_pending = bool(system_parts)

        converted = []
        for msg in messages:
            if msg.role == "system":
                continue
            if msg.role == "assistant":
                converted.append({"role": "model", "parts": [msg.content]})
                continue
            content = msg.content
            if system_pending:
                content = system_text + "\n\n" + content
                system_pending = False
            converted.append({"role": "user", "parts": [content]})

        if not converted:
            # Only system messages (or no messages): send the instructions
            # themselves rather than an empty prompt.
            return [], system_text

        # The last entry is the message to send; everything before is history.
        return converted[:-1], converted[-1]["parts"][0]

    async def chat_completions_create(
        self,
        messages: List[LLMMessage],
        model: str,
        temperature: float,
        max_tokens: int,
        timeout: int,
        **kwargs
    ) -> LLMResponse:
        """
        Send a non-streaming request.

        Args:
            messages: Conversation so far.
            model: Model identifier. Forwarded to the API in OpenAI-compatible
                mode; in native mode it is only echoed in the response.
            temperature: Sampling temperature.
            max_tokens: Output token limit.
            timeout: Per-request timeout (forwarded in OpenAI-compatible mode only).
            **kwargs: Extra arguments (forwarded in OpenAI-compatible mode only).

        Returns:
            The reply converted into the unified ``LLMResponse``.

        Raises:
            RuntimeError: If the required SDK could not be imported at init.
        """
        if not self._available:
            raise RuntimeError("Gemini client not available")

        if self._use_openai_compat:
            api_messages = [
                {"role": m.role, "content": m.content}
                for m in messages
            ]

            response = await self._client.chat.completions.create(
                model=model,
                messages=api_messages,
                temperature=temperature,
                max_tokens=max_tokens,
                timeout=timeout,
                **kwargs
            )

            choices = [
                LLMChoice(
                    index=c.index,
                    message=LLMMessage(role=c.message.role, content=c.message.content),
                    finish_reason=c.finish_reason
                )
                for c in response.choices
            ]

            usage = None
            if response.usage:
                usage = LLMUsage(
                    prompt_tokens=response.usage.prompt_tokens,
                    completion_tokens=response.usage.completion_tokens,
                    total_tokens=response.usage.total_tokens
                )

            return LLMResponse(
                id=response.id,
                model=response.model,
                choices=choices,
                usage=usage,
                created=response.created
            )

        # Native SDK mode.
        # NOTE(review): this path uses the model object built in __init__ and
        # does not forward `model`, `timeout` or **kwargs to the SDK - confirm
        # that is intended.
        history, current_msg = self._convert_messages_to_gemini(messages)

        generation_config = {
            "temperature": temperature,
            "max_output_tokens": max_tokens,
        }

        chat = self._model.start_chat(history=history)
        response = await chat.send_message_async(
            current_msg,
            generation_config=generation_config
        )

        content = response.text if response.text else ""

        choices = [
            LLMChoice(
                index=0,
                message=LLMMessage(role="assistant", content=content),
                finish_reason="stop"
            )
        ]

        # Token counts come from usage_metadata when the response has it.
        usage = None
        if hasattr(response, 'usage_metadata') and response.usage_metadata:
            usage = LLMUsage(
                prompt_tokens=getattr(response.usage_metadata, 'prompt_token_count', 0),
                completion_tokens=getattr(response.usage_metadata, 'candidates_token_count', 0),
                total_tokens=getattr(response.usage_metadata, 'total_token_count', 0)
            )

        # The native path synthesizes the response id and creation time locally.
        return LLMResponse(
            id=f"gemini-{uuid.uuid4().hex[:12]}",
            model=model,
            choices=choices,
            usage=usage,
            created=int(time.time())
        )

    async def chat_completions_create_stream(
        self,
        messages: List[LLMMessage],
        model: str,
        temperature: float,
        max_tokens: int,
        timeout: int,
        **kwargs
    ) -> AsyncIterator[LLMResponse]:
        """
        Send a streaming request.

        Yields one ``LLMResponse`` per non-empty chunk, each holding a single
        choice (index 0) whose ``delta`` carries the new text. Arguments are
        handled as in ``chat_completions_create``.

        Raises:
            RuntimeError: If the required SDK could not be imported at init.
        """
        if not self._available:
            raise RuntimeError("Gemini client not available")

        if self._use_openai_compat:
            api_messages = [
                {"role": m.role, "content": m.content}
                for m in messages
            ]

            stream = await self._client.chat.completions.create(
                model=model,
                messages=api_messages,
                temperature=temperature,
                max_tokens=max_tokens,
                timeout=timeout,
                stream=True,
                **kwargs
            )

            async for chunk in stream:
                if chunk.choices:
                    delta_content = chunk.choices[0].delta.content or ""
                    choices = [
                        LLMChoice(
                            index=0,
                            delta=LLMMessage(role="assistant", content=delta_content),
                            finish_reason=chunk.choices[0].finish_reason
                        )
                    ]
                    yield LLMResponse(
                        id=chunk.id,
                        model=chunk.model,
                        choices=choices,
                        created=chunk.created
                    )
        else:
            # Native SDK streaming.
            history, current_msg = self._convert_messages_to_gemini(messages)

            generation_config = {
                "temperature": temperature,
                "max_output_tokens": max_tokens,
            }

            chat = self._model.start_chat(history=history)
            response = await chat.send_message_async(
                current_msg,
                generation_config=generation_config,
                stream=True
            )

            # One synthesized id shared by every chunk of this response.
            response_id = f"gemini-{uuid.uuid4().hex[:12]}"

            async for chunk in response:
                if chunk.text:
                    choices = [
                        LLMChoice(
                            index=0,
                            delta=LLMMessage(role="assistant", content=chunk.text),
                            finish_reason=None
                        )
                    ]
                    yield LLMResponse(
                        id=response_id,
                        model=model,
                        choices=choices,
                        created=int(time.time())
                    )

    def validate_connection(self) -> bool:
        """Report whether the SDK was loaded and an API key is configured (no network call)."""
        return self._available and bool(self.config.api_key)
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
def create_gemini_provider(
    api_key: str,
    model_name: str = "gemini-1.5-flash",
    base_url: str = None,
    **kwargs
) -> GeminiProvider:
    """
    Factory helper: build a ``GeminiProvider`` from plain arguments.

    Args:
        api_key: Google AI API key.
        model_name: Model to use.
        base_url: OpenAI-compatible endpoint (optional). When omitted, the
            provider uses the native SDK.
        **kwargs: Extra fields forwarded to ``LLMConfig``.
    """
    return GeminiProvider(
        LLMConfig(
            provider=LLMProviderType.GEMINI,
            api_key=api_key,
            model_name=model_name,
            base_url=base_url,
            **kwargs
        )
    )
|
app/utils/llm_providers/openai_provider.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 文件路径: app/utils/llm_providers/openai_provider.py
|
| 2 |
+
"""
|
| 3 |
+
OpenAI LLM 提供商实现
|
| 4 |
+
|
| 5 |
+
支持模型: GPT-4, GPT-4o, GPT-4o-mini, GPT-3.5-turbo 等
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from typing import List, AsyncIterator
|
| 9 |
+
from openai import AsyncOpenAI
|
| 10 |
+
|
| 11 |
+
from .base import (
|
| 12 |
+
BaseLLMProvider,
|
| 13 |
+
LLMConfig,
|
| 14 |
+
LLMMessage,
|
| 15 |
+
LLMResponse,
|
| 16 |
+
LLMChoice,
|
| 17 |
+
LLMUsage,
|
| 18 |
+
LLMProviderType
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class OpenAIProvider(BaseLLMProvider):
    """Provider for the OpenAI chat completions API."""

    def __init__(self, config: LLMConfig):
        """Build the async client; ``config.base_url`` is an optional custom endpoint."""
        super().__init__(config)
        client_options = {
            "api_key": config.api_key,
            "base_url": config.base_url,
            "timeout": config.timeout,
        }
        self._client = AsyncOpenAI(**client_options)

    async def chat_completions_create(
        self,
        messages: List[LLMMessage],
        model: str,
        temperature: float,
        max_tokens: int,
        timeout: int,
        **kwargs
    ) -> LLMResponse:
        """
        Send a non-streaming chat completion request.

        Only ``role`` and ``content`` of each message are sent; all other
        arguments are forwarded to the SDK call. The SDK response is
        converted into the unified ``LLMResponse``.
        """
        # Unified messages -> SDK message dicts.
        outgoing = []
        for entry in messages:
            outgoing.append({"role": entry.role, "content": entry.content})

        result = await self._client.chat.completions.create(
            model=model,
            messages=outgoing,
            temperature=temperature,
            max_tokens=max_tokens,
            timeout=timeout,
            **kwargs
        )

        # SDK choices -> unified choices.
        converted = []
        for option in result.choices:
            answer = LLMMessage(role=option.message.role, content=option.message.content)
            converted.append(
                LLMChoice(
                    index=option.index,
                    message=answer,
                    finish_reason=option.finish_reason,
                )
            )

        # Usage is optional on the SDK response; leave it as None when absent.
        counts = None
        if result.usage:
            counts = LLMUsage(
                prompt_tokens=result.usage.prompt_tokens,
                completion_tokens=result.usage.completion_tokens,
                total_tokens=result.usage.total_tokens,
            )

        return LLMResponse(
            id=result.id,
            model=result.model,
            choices=converted,
            usage=counts,
            created=result.created,
        )

    async def chat_completions_create_stream(
        self,
        messages: List[LLMMessage],
        model: str,
        temperature: float,
        max_tokens: int,
        timeout: int,
        **kwargs
    ) -> AsyncIterator[LLMResponse]:
        """
        Send a streaming chat completion request.

        Yields one ``LLMResponse`` per SDK chunk that carries choices; each
        holds a single choice (index 0) built from the chunk's first choice,
        with a missing delta content replaced by an empty string.
        """
        outgoing = []
        for entry in messages:
            outgoing.append({"role": entry.role, "content": entry.content})

        stream = await self._client.chat.completions.create(
            model=model,
            messages=outgoing,
            temperature=temperature,
            max_tokens=max_tokens,
            timeout=timeout,
            stream=True,
            **kwargs
        )

        async for part in stream:
            # Chunks without choices carry nothing to forward.
            if not part.choices:
                continue
            fragment = part.choices[0].delta.content or ""
            increment = LLMChoice(
                index=0,
                delta=LLMMessage(role="assistant", content=fragment),
                finish_reason=part.choices[0].finish_reason,
            )
            yield LLMResponse(
                id=part.id,
                model=part.model,
                choices=[increment],
                created=part.created,
            )

    def validate_connection(self) -> bool:
        """Report whether an API key is configured (no network call is made)."""
        key = self.config.api_key
        return bool(key)
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def create_openai_provider(
    api_key: str,
    model_name: str = "gpt-4o-mini",
    base_url: str = None,
    **kwargs
) -> OpenAIProvider:
    """
    Factory helper: build an ``OpenAIProvider`` from plain arguments.

    Args:
        api_key: OpenAI API key.
        model_name: Model to use.
        base_url: Custom endpoint (optional).
        **kwargs: Extra fields forwarded to ``LLMConfig``.
    """
    return OpenAIProvider(
        LLMConfig(
            provider=LLMProviderType.OPENAI,
            api_key=api_key,
            model_name=model_name,
            base_url=base_url,
            **kwargs
        )
    )
|
app/utils/repo_lock.py
ADDED
|
@@ -0,0 +1,390 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
仓库级分布式锁
|
| 4 |
+
|
| 5 |
+
解决问题:
|
| 6 |
+
1. 同一仓库的并发写入竞争 (两人同时输入同一 URL)
|
| 7 |
+
2. 重新分析时的数据一致性 (用户 A 重分析,用户 B 同时查询)
|
| 8 |
+
|
| 9 |
+
设计原则:
|
| 10 |
+
- 单进程: asyncio.Lock (内存锁)
|
| 11 |
+
- 多进程: 文件锁 (fcntl/msvcrt)
|
| 12 |
+
- 多节点: 可选 Redis 分布式锁 (生产环境)
|
| 13 |
+
|
| 14 |
+
使用示例:
|
| 15 |
+
```python
|
| 16 |
+
async with RepoLock.acquire(session_id):
|
| 17 |
+
# 独占访问该仓库的写操作
|
| 18 |
+
await vector_store.reset()
|
| 19 |
+
await vector_store.add_documents(docs)
|
| 20 |
+
```
|
| 21 |
+
"""
|
| 22 |
+
|
| 23 |
+
import asyncio
|
| 24 |
+
import logging
|
| 25 |
+
import os
|
| 26 |
+
import time
|
| 27 |
+
from abc import ABC, abstractmethod
|
| 28 |
+
from contextlib import asynccontextmanager
|
| 29 |
+
from dataclasses import dataclass
|
| 30 |
+
from pathlib import Path
|
| 31 |
+
from typing import Dict, Optional
|
| 32 |
+
|
| 33 |
+
logger = logging.getLogger(__name__)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# ============================================================
|
| 37 |
+
# 锁配置
|
| 38 |
+
# ============================================================
|
| 39 |
+
|
| 40 |
+
@dataclass
class LockConfig:
    """
    Lock settings.

    Each default is read from an environment variable once, when this class
    body is evaluated.
    """

    # Backend type: "memory" | "file" | "redis" (env LOCK_BACKEND).
    backend: str = os.environ.get("LOCK_BACKEND", "file")

    # Directory for file locks (env LOCK_DIR).
    lock_dir: str = os.environ.get("LOCK_DIR", "data/locks")

    # Redis connection URL, optional (env REDIS_URL).
    redis_url: str = os.environ.get("REDIS_URL", "redis://localhost:6379/0")

    # Lock timeout in seconds (env LOCK_TIMEOUT); the default is 5 minutes.
    lock_timeout: float = float(os.environ.get("LOCK_TIMEOUT", "300"))

    # Wait (acquire) timeout in seconds (env LOCK_ACQUIRE_TIMEOUT).
    acquire_timeout: float = float(os.environ.get("LOCK_ACQUIRE_TIMEOUT", "60"))
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
# ============================================================
|
| 60 |
+
# 锁后端抽象
|
| 61 |
+
# ============================================================
|
| 62 |
+
|
| 63 |
+
class LockBackend(ABC):
    """Abstract interface implemented by every lock backend."""

    @abstractmethod
    async def acquire(self, key: str, timeout: float) -> bool:
        """Acquire the lock for ``key``, given a ``timeout``; the result says whether it was obtained."""

    @abstractmethod
    async def release(self, key: str) -> None:
        """Release the lock for ``key``."""

    @abstractmethod
    async def is_locked(self, key: str) -> bool:
        """Report whether ``key`` is currently locked."""
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
# ============================================================
|
| 83 |
+
# 内存锁 (单进程)
|
| 84 |
+
# ============================================================
|
| 85 |
+
|
| 86 |
+
class MemoryLockBackend(LockBackend):
    """
    In-memory lock backend built on ``asyncio.Lock``.

    Intended for single-worker deployments: the locks live in this
    object's dictionary and are not shared with any other process.
    """

    def __init__(self):
        self._locks: Dict[str, asyncio.Lock] = {}
        # Serialises creation of per-key locks
        self._meta_lock = asyncio.Lock()

    async def _get_lock(self, key: str) -> asyncio.Lock:
        """Return the lock registered for ``key``, creating it on first use."""
        async with self._meta_lock:
            existing = self._locks.get(key)
            if existing is None:
                existing = asyncio.Lock()
                self._locks[key] = existing
            return existing

    async def acquire(self, key: str, timeout: float) -> bool:
        """Wait up to ``timeout`` seconds for the lock; return False on timeout."""
        target = await self._get_lock(key)
        try:
            await asyncio.wait_for(target.acquire(), timeout=timeout)
        except asyncio.TimeoutError:
            return False
        return True

    async def release(self, key: str) -> None:
        """Release the lock for ``key`` if it exists and is currently held."""
        target = self._locks.get(key)
        if target is not None and target.locked():
            target.release()

    async def is_locked(self, key: str) -> bool:
        """Return True when a lock for ``key`` exists and is held."""
        target = self._locks.get(key)
        return target is not None and target.locked()
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
# ============================================================
|
| 124 |
+
# 文件锁 (多进程,单节点)
|
| 125 |
+
# ============================================================
|
| 126 |
+
|
| 127 |
+
class FileLockBackend(LockBackend):
    """
    File-based lock backend.

    Intended for multi-worker, single-node deployments (Gunicorn + Qdrant
    Server).

    Implementation:
    - Windows: ``msvcrt.locking``
    - Unix: ``fcntl.flock``

    Each key is guarded twice: an ``asyncio.Lock`` serialises coroutines
    inside this process, and an OS-level lock on ``<lock_dir>/<key>.lock``
    serialises processes.
    """

    def __init__(self, lock_dir: str):
        """Create ``lock_dir`` if needed and initialise empty bookkeeping."""
        self._lock_dir = Path(lock_dir)
        self._lock_dir.mkdir(parents=True, exist_ok=True)
        # key -> open file object whose OS lock this instance currently holds
        self._handles: Dict[str, object] = {}
        # key -> in-process lock
        self._memory_locks: Dict[str, asyncio.Lock] = {}
        # Serialises creation of the per-key in-process locks
        self._meta_lock = asyncio.Lock()

    def _get_lock_path(self, key: str) -> Path:
        """Map ``key`` to its lock file, replacing unsafe characters with ``_``."""
        # Sanitise the key so it cannot escape the lock directory
        safe_key = "".join(c if c.isalnum() or c in "_-" else "_" for c in key)
        return self._lock_dir / f"{safe_key}.lock"

    async def _get_memory_lock(self, key: str) -> asyncio.Lock:
        """Return the in-process lock for ``key``, creating it on first use.

        It keeps coroutines of the same process from competing for the
        file lock.
        """
        async with self._meta_lock:
            if key not in self._memory_locks:
                self._memory_locks[key] = asyncio.Lock()
            return self._memory_locks[key]

    @staticmethod
    def _lock_handle(handle) -> None:
        """Take a non-blocking exclusive OS lock on ``handle``.

        Raises:
            OSError: if the lock cannot be taken right now.
        """
        if os.name == 'nt':
            # Windows
            import msvcrt
            msvcrt.locking(handle.fileno(), msvcrt.LK_NBLCK, 1)
        else:
            # Unix
            import fcntl
            fcntl.flock(handle.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)

    @staticmethod
    def _unlock_handle(handle) -> None:
        """Drop the OS lock previously taken on ``handle``."""
        if os.name == 'nt':
            import msvcrt
            msvcrt.locking(handle.fileno(), msvcrt.LK_UNLCK, 1)
        else:
            import fcntl
            fcntl.flock(handle.fileno(), fcntl.LOCK_UN)

    def _open_locked(self, lock_path: Path):
        """Open ``lock_path`` and lock it, closing the file again on any failure."""
        handle = open(lock_path, 'w')
        try:
            self._lock_handle(handle)
        except BaseException:
            handle.close()
            raise
        return handle

    async def acquire(self, key: str, timeout: float) -> bool:
        """Acquire the lock for ``key``.

        The in-process lock is awaited for up to ``timeout`` seconds, then
        the file lock is polled every 0.1s for up to ``timeout`` seconds
        more (the second wait starts its own clock).

        Returns:
            True when both locks are held, False on timeout.
        """
        # In-process lock first
        mem_lock = await self._get_memory_lock(key)
        try:
            await asyncio.wait_for(mem_lock.acquire(), timeout=timeout)
        except asyncio.TimeoutError:
            return False

        # Then the file lock
        lock_path = self._get_lock_path(key)
        start_time = time.time()

        try:
            while time.time() - start_time < timeout:
                try:
                    handle = self._open_locked(lock_path)
                except (IOError, OSError):
                    # Held elsewhere (or the file could not be opened): retry
                    await asyncio.sleep(0.1)
                    continue

                self._handles[key] = handle
                logger.debug(f"🔒 文件锁获取成功: {key}")
                return True
        except BaseException:
            # Cancellation or an unexpected error must not leave the
            # in-process lock held forever.
            mem_lock.release()
            raise

        # Timed out: give the in-process lock back
        mem_lock.release()
        logger.warning(f"⏰ 文件锁获取超时: {key}")
        return False

    async def release(self, key: str) -> None:
        """Release the lock for ``key`` if this instance holds it.

        When no file handle is registered for ``key`` nothing is released;
        in particular the in-process lock is left alone, because it may be
        held by a coroutine that is still waiting for the file lock.
        """
        handle = self._handles.pop(key, None)
        if handle is None:
            return

        try:
            try:
                self._unlock_handle(handle)
            finally:
                # Always close the file, even if the explicit unlock failed
                handle.close()
        except (OSError, ValueError) as exc:
            # Best effort: report the problem instead of hiding it
            logger.warning(f"⚠️ 文件锁释放异常: {key}: {exc}")
        logger.debug(f"🔓 文件锁已释放: {key}")

        # Release the in-process lock that acquire() took for this key
        mem_lock = self._memory_locks.get(key)
        if mem_lock is not None and mem_lock.locked():
            mem_lock.release()

    async def is_locked(self, key: str) -> bool:
        """Probe whether the file lock for ``key`` is currently held.

        The probe briefly takes and drops the lock itself. Returns False
        when the lock file does not exist or the probe succeeds, True when
        opening, locking or unlocking raises ``OSError``.
        """
        lock_path = self._get_lock_path(key)
        if not lock_path.exists():
            return False

        try:
            handle = self._open_locked(lock_path)
            try:
                self._unlock_handle(handle)
            finally:
                # Close on every path so the probe never leaks a descriptor
                handle.close()
        except (IOError, OSError):
            return True
        return False
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
# ============================================================
|
| 244 |
+
# Redis 锁 (分布式,多节点)
|
| 245 |
+
# ============================================================
|
| 246 |
+
|
| 247 |
+
class RedisLockBackend(LockBackend):
    """
    Redis distributed lock backend.

    Intended for multi-node deployments (K8s + Redis).

    Requires: redis[hiredis]

    The lock is a ``repo_lock:<key>`` entry written with ``SET NX EX``.
    Its value is a random token remembered by this instance, and
    ``release`` deletes the entry only while it still carries that token.
    A holder whose lock already expired therefore cannot delete a lock
    that another process has since acquired.
    """

    # Compare-and-delete: remove the key only if it still holds our token
    _RELEASE_SCRIPT = (
        "if redis.call('get', KEYS[1]) == ARGV[1] then "
        "return redis.call('del', KEYS[1]) "
        "else return 0 end"
    )

    def __init__(self, redis_url: str, lock_timeout: float):
        """Store connection settings; the client is created on first use."""
        self._redis_url = redis_url
        self._lock_timeout = lock_timeout
        self._client = None
        # key -> token of the lock this instance currently holds
        self._locks: Dict[str, object] = {}

    async def _get_client(self):
        """Return the shared Redis client, creating it on first call.

        Raises:
            RuntimeError: if the ``redis`` package is not installed.
        """
        if self._client is None:
            try:
                import redis.asyncio as aioredis
                self._client = await aioredis.from_url(self._redis_url)
            except ImportError:
                raise RuntimeError(
                    "Redis 锁需要安装 redis 包: pip install redis[hiredis]"
                )
        return self._client

    async def acquire(self, key: str, timeout: float) -> bool:
        """Poll every 0.1s for up to ``timeout`` seconds to set the lock key.

        Returns:
            True once the key was set, False on timeout.
        """
        import uuid

        client = await self._get_client()
        lock_key = f"repo_lock:{key}"
        token = uuid.uuid4().hex
        # EX must be a positive integer; a sub-second timeout would truncate to 0
        ttl_seconds = max(1, int(self._lock_timeout))

        start_time = time.time()
        while time.time() - start_time < timeout:
            # Try to set the lock
            acquired = await client.set(
                lock_key,
                token,
                nx=True,
                ex=ttl_seconds
            )
            if acquired:
                self._locks[key] = token
                logger.debug(f"🔒 Redis 锁获取成功: {key}")
                return True
            await asyncio.sleep(0.1)

        logger.warning(f"⏰ Redis 锁获取超时: {key}")
        return False

    async def release(self, key: str) -> None:
        """Release the lock for ``key`` if this instance still owns it.

        Does nothing when this instance holds no token for ``key``. When
        the key expired or was taken over in the meantime, it is left
        untouched and a warning is logged.
        """
        token = self._locks.pop(key, None)
        if token is None:
            return

        client = await self._get_client()
        lock_key = f"repo_lock:{key}"
        deleted = await client.eval(self._RELEASE_SCRIPT, 1, lock_key, token)
        if deleted:
            logger.debug(f"🔓 Redis 锁已释放: {key}")
        else:
            logger.warning(f"⚠️ Redis 锁已过期或被其他持有者占用,跳过释放: {key}")

    async def is_locked(self, key: str) -> bool:
        """Return True when the lock key exists in Redis."""
        client = await self._get_client()
        lock_key = f"repo_lock:{key}"
        return await client.exists(lock_key) > 0
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
# ============================================================
|
| 307 |
+
# 统一锁接口
|
| 308 |
+
# ============================================================
|
| 309 |
+
|
| 310 |
+
class RepoLock:
    """
    Repository-level lock with a single, backend-agnostic interface.

    The backend is chosen from ``LockConfig.backend`` on first use and
    then cached on the class:
    - memory: in-process lock (development)
    - file: file lock (multiple processes, one node)
    - redis: distributed lock (multiple nodes)

    Any value other than "redis" or "file" selects the memory backend.

    Usage:
    ```python
    async with RepoLock.acquire(session_id):
        # exclusive write section
        await store.reset()
    ```
    """

    _backend: Optional[LockBackend] = None
    _config: Optional[LockConfig] = None

    @classmethod
    def _get_backend(cls) -> LockBackend:
        """Return the cached backend, building it from ``LockConfig`` on first call."""
        if cls._backend is None:
            cls._config = LockConfig()

            if cls._config.backend == "redis":
                cls._backend = RedisLockBackend(
                    cls._config.redis_url,
                    cls._config.lock_timeout
                )
                logger.info("🔐 使用 Redis 分布式锁")
            elif cls._config.backend == "file":
                cls._backend = FileLockBackend(cls._config.lock_dir)
                logger.info(f"🔐 使用文件锁: {cls._config.lock_dir}")
            else:
                cls._backend = MemoryLockBackend()
                logger.info("🔐 使用内存锁 (单进程)")

        return cls._backend

    @classmethod
    @asynccontextmanager
    async def acquire(cls, session_id: str, timeout: Optional[float] = None):
        """
        Hold the repository write lock for the duration of the ``async with`` body.

        Args:
            session_id: session ID of the repository
            timeout: seconds to wait for the lock; ``None`` (or any other
                falsy value such as 0) falls back to the configured
                ``acquire_timeout``

        Raises:
            TimeoutError: the lock could not be obtained in time
        """
        backend = cls._get_backend()
        config = cls._config or LockConfig()
        wait_timeout = timeout or config.acquire_timeout

        acquired = await backend.acquire(session_id, wait_timeout)
        if not acquired:
            raise TimeoutError(f"无法获取仓库锁: {session_id} (等待 {wait_timeout}s)")

        try:
            yield
        finally:
            # Released on normal exit and when the body raises
            await backend.release(session_id)

    @classmethod
    async def is_locked(cls, session_id: str) -> bool:
        """Return whether the repository lock is currently held."""
        backend = cls._get_backend()
        return await backend.is_locked(session_id)

    @classmethod
    async def try_acquire(cls, session_id: str, timeout: float = 0.1) -> bool:
        """
        Try to take the lock, waiting at most ``timeout`` seconds.

        Used to detect whether another user is already analysing the same
        repository.

        Returns:
            True when the lock was obtained. The lock then STAYS HELD;
            the caller must call ``RepoLock.release(session_id)`` when done.
            False when it could not be obtained in time.
        """
        backend = cls._get_backend()
        return await backend.acquire(session_id, timeout)

    @classmethod
    async def release(cls, session_id: str) -> None:
        """Release a lock previously obtained through ``try_acquire``."""
        backend = cls._get_backend()
        await backend.release(session_id)
|
app/utils/retry.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 文件路径: app/utils/retry.py
|
| 2 |
+
"""
|
| 3 |
+
LLM 调用重试机制
|
| 4 |
+
|
| 5 |
+
使用 tenacity 库实现智能重试策略:
|
| 6 |
+
- 指数退避 (Exponential Backoff)
|
| 7 |
+
- 可重试异常识别
|
| 8 |
+
- 最大重试次数限制
|
| 9 |
+
- 详细日志记录
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import logging
|
| 13 |
+
from typing import Callable, Type, Tuple, Any
|
| 14 |
+
from functools import wraps
|
| 15 |
+
|
| 16 |
+
from tenacity import (
|
| 17 |
+
retry,
|
| 18 |
+
stop_after_attempt,
|
| 19 |
+
wait_exponential,
|
| 20 |
+
retry_if_exception_type,
|
| 21 |
+
before_sleep_log,
|
| 22 |
+
after_log,
|
| 23 |
+
RetryError,
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
# 配置日志
|
| 27 |
+
logger = logging.getLogger("llm_retry")
logger.setLevel(logging.INFO)


# ============================================================================
# Retryable exception types
# ============================================================================

# Network / transient failures that are worth retrying. The tuple starts
# with the built-ins and is extended below with the error classes of each
# SDK that happens to be installed.
RETRYABLE_EXCEPTIONS: Tuple[Type[Exception], ...] = (
    ConnectionError,
    TimeoutError,
)

# OpenAI SDK (optional)
try:
    from openai import (
        APIConnectionError,
        APITimeoutError,
        RateLimitError,
        InternalServerError,
    )
except ImportError:
    pass
else:
    RETRYABLE_EXCEPTIONS += (
        APIConnectionError,
        APITimeoutError,
        RateLimitError,
        InternalServerError,
    )

# Anthropic SDK (optional); aliased so the OpenAI names are not shadowed
try:
    from anthropic import (
        APIConnectionError as AnthropicConnectionError,
        APITimeoutError as AnthropicTimeoutError,
        RateLimitError as AnthropicRateLimitError,
        InternalServerError as AnthropicServerError,
    )
except ImportError:
    pass
else:
    RETRYABLE_EXCEPTIONS += (
        AnthropicConnectionError,
        AnthropicTimeoutError,
        AnthropicRateLimitError,
        AnthropicServerError,
    )

# httpx (optional)
try:
    import httpx
except ImportError:
    pass
else:
    RETRYABLE_EXCEPTIONS += (
        httpx.ConnectError,
        httpx.ReadTimeout,
        httpx.ConnectTimeout,
    )
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
# ============================================================================
|
| 86 |
+
# 重试配置
|
| 87 |
+
# ============================================================================
|
| 88 |
+
|
| 89 |
+
class RetryConfig:
    """Default tuning values for retrying LLM calls."""

    # Attempt limit used as the default for stop_after_attempt
    MAX_ATTEMPTS: int = 3
    # Lower bound of the wait between attempts, in seconds
    MIN_WAIT_SECONDS: float = 1.0
    # Upper bound of the wait between attempts, in seconds
    MAX_WAIT_SECONDS: float = 30.0
    # Multiplier passed to the exponential back-off
    EXPONENTIAL_MULTIPLIER: float = 2.0
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
# ============================================================================
|
| 98 |
+
# 重试装饰器
|
| 99 |
+
# ============================================================================
|
| 100 |
+
|
| 101 |
+
def create_retry_decorator(
    max_attempts: int = RetryConfig.MAX_ATTEMPTS,
    min_wait: float = RetryConfig.MIN_WAIT_SECONDS,
    max_wait: float = RetryConfig.MAX_WAIT_SECONDS,
):
    """
    Build a tenacity ``retry`` decorator for LLM calls.

    Args:
        max_attempts: value handed to ``stop_after_attempt``
        min_wait: minimum wait between attempts (seconds)
        max_wait: maximum wait between attempts (seconds)

    Returns:
        A tenacity retry decorator.
    """
    # Only exceptions listed in RETRYABLE_EXCEPTIONS trigger another attempt
    should_retry = retry_if_exception_type(RETRYABLE_EXCEPTIONS)
    # Give up once the attempt limit is reached
    stop_rule = stop_after_attempt(max_attempts)
    # Exponential back-off bounded by [min_wait, max_wait]
    backoff = wait_exponential(
        multiplier=RetryConfig.EXPONENTIAL_MULTIPLIER,
        min=min_wait,
        max=max_wait,
    )
    # Log at WARNING before each sleep and at DEBUG after each attempt
    log_before_sleep = before_sleep_log(logger, logging.WARNING)
    log_after_attempt = after_log(logger, logging.DEBUG)

    return retry(
        retry=should_retry,
        stop=stop_rule,
        wait=backoff,
        before_sleep=log_before_sleep,
        after=log_after_attempt,
        # Re-raise the last underlying exception
        reraise=True,
    )


# Default retry decorator instance
llm_retry = create_retry_decorator()
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def with_retry(func: Callable) -> Callable:
    """
    Decorator that adds the default ``llm_retry`` policy to an async function.

    Usage:
        @with_retry
        async def call_llm(...):
            ...
    """
    @wraps(func)
    async def wrapper(*args, **kwargs):
        # A fresh retry-wrapped closure is built for every call
        async def _attempt():
            return await func(*args, **kwargs)

        guarded = llm_retry(_attempt)
        return await guarded()

    return wrapper
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
# ============================================================================
|
| 160 |
+
# 便捷函数
|
| 161 |
+
# ============================================================================
|
| 162 |
+
|
| 163 |
+
async def retry_async(
    coro_func: Callable,
    *args,
    max_attempts: int = RetryConfig.MAX_ATTEMPTS,
    **kwargs
) -> Any:
    """
    Await ``coro_func(*args, **kwargs)`` under a retry policy.

    ``max_attempts`` is consumed here and is not forwarded to ``coro_func``.

    Usage:
        result = await retry_async(
            client.chat.completions.create,
            model="gpt-4",
            messages=[...]
        )
    """
    policy = create_retry_decorator(max_attempts=max_attempts)

    async def _call():
        return await coro_func(*args, **kwargs)

    guarded_call = policy(_call)
    return await guarded_call()
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def is_retryable_error(error: Exception) -> bool:
    """Return True when ``error`` is an instance of a class in RETRYABLE_EXCEPTIONS."""
    retryable = isinstance(error, RETRYABLE_EXCEPTIONS)
    return retryable
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
def log_retry_info(attempt: int, max_attempts: int, error: Exception, wait_time: float):
    """Log a warning describing a failed attempt and the wait before the next one."""
    error_kind = type(error).__name__
    message = (
        f"🔄 LLM 调用失败 (尝试 {attempt}/{max_attempts}): {error_kind}: {error}. "
        f"等待 {wait_time:.1f}s 后重试..."
    )
    logger.warning(message)
|
app/utils/session.py
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Session 工具模块
|
| 4 |
+
|
| 5 |
+
提供基于仓库 URL 的 Session ID 生成和管理
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import hashlib
|
| 9 |
+
import re
|
| 10 |
+
from typing import Optional, Tuple, Dict
|
| 11 |
+
from urllib.parse import urlparse
|
| 12 |
+
|
| 13 |
+
from app.core.config import conversation_config
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def normalize_repo_url(url: str) -> str:
    """
    Normalise a GitHub repository URL.

    Supported forms:
    - https://github.com/owner/repo
    - https://github.com/owner/repo.git
    - https://github.com/owner/repo/tree/main
    - git@github.com:owner/repo.git
    - github.com/owner/repo (no scheme)
    - owner/repo

    Returns:
        The canonical URL ``https://github.com/owner/repo`` (all lower-case).
    """
    url = url.strip().lower()  # case-insensitive: lower-case everything

    ssh_match = re.match(r'git@github\.com:(.+)$', url)
    if ssh_match:
        # SSH form: everything after the colon is the repository path
        raw_path = ssh_match.group(1)
        has_host = True
    else:
        parsed = urlparse(url)
        raw_path = parsed.path
        has_host = bool(parsed.netloc)

    parts = raw_path.strip('/').split('/')

    # Without a scheme urlparse puts the host into the path; drop it so
    # "github.com/owner/repo" does not become owner="github.com".
    if not has_host and parts[0] in ('github.com', 'www.github.com'):
        parts = parts[1:]

    # Keep only owner/repo, then strip a ".git" suffix from the last kept
    # segment (also covers ".../repo.git/tree/main").
    parts = parts[:2]
    if parts and parts[-1].endswith('.git'):
        parts[-1] = parts[-1][:-4]

    return f"https://github.com/{'/'.join(parts)}"
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def extract_repo_info(url: str) -> Tuple[str, str]:
    """
    Pull the owner and repository name out of a repository URL.

    Returns:
        An ``(owner, repo)`` tuple, or ``("", "")`` when the normalised
        path has fewer than two segments.
    """
    repo_path = urlparse(normalize_repo_url(url)).path.strip('/')
    segments = repo_path.split('/')

    if len(segments) < 2:
        return "", ""
    return segments[0], segments[1]
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def generate_repo_session_id(repo_url: str) -> str:
    """
    Derive a stable session ID from a repository URL.

    The same normalised URL always produces the same session ID.

    Format: repo_{short_hash}_{owner}_{repo}
    """
    canonical_url = normalize_repo_url(repo_url)
    owner, repo = extract_repo_info(repo_url)

    # First 8 hex characters of the SHA-256 of the normalised URL
    short_hash = hashlib.sha256(canonical_url.encode()).hexdigest()[:8]

    # Keep only ASCII letters and digits, truncated to 10 / 15 characters
    non_alnum = r'[^a-zA-Z0-9]'
    owner_tag = re.sub(non_alnum, '', owner)[:10]
    repo_tag = re.sub(non_alnum, '', repo)[:15]

    return f"repo_{short_hash}_{owner_tag}_{repo_tag}"
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def is_repo_session_id(session_id: str) -> bool:
    """Return True when ``session_id`` carries the repository-level ``repo_`` prefix."""
    repo_prefix = "repo_"
    return session_id.startswith(repo_prefix)
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
# === 对话历史管理 ===
|
| 97 |
+
|
| 98 |
+
class ConversationMemory:
    """
    Conversation memory: sliding window plus summary compression.

    Features:
    1. Keeps the most recent N turns verbatim
    2. Earlier messages can be replaced by a summary supplied by the caller
    3. Stores a token budget (``max_context_tokens``) for callers to use
    """

    def __init__(
        self,
        max_recent_turns: Optional[int] = None,
        max_context_tokens: Optional[int] = None,
        summary_threshold: Optional[int] = None,
    ):
        """
        Args:
            max_recent_turns: number of recent turns returned verbatim
            max_context_tokens: token budget stored on the instance
            summary_threshold: number of unsummarised turns beyond which
                summarisation is requested

        Any argument left as ``None`` (or given another falsy value such
        as 0) falls back to the value in ``conversation_config``.
        """
        self.max_recent_turns = max_recent_turns or conversation_config.max_recent_turns
        self.max_context_tokens = max_context_tokens or conversation_config.max_context_tokens
        self.summary_threshold = summary_threshold or conversation_config.summary_threshold

        self._messages: list = []              # full message history
        self._summary: Optional[str] = None    # summary of earlier messages
        self._summary_up_to: int = 0           # summary covers messages [0, N)

    def add_message(self, role: str, content: str) -> None:
        """Append a message with the given role and content."""
        self._messages.append({
            "role": role,
            "content": content
        })

    def add_user_message(self, content: str) -> None:
        """Append a message with role ``"user"``."""
        self.add_message("user", content)

    def add_assistant_message(self, content: str) -> None:
        """Append a message with role ``"assistant"``."""
        self.add_message("assistant", content)

    def get_context_messages(self) -> list:
        """
        Return the messages to send to the LLM as context.

        Strategy:
        1. If there are at most ``max_recent_turns * 2`` messages, return
           a copy of all of them
        2. Otherwise return the most recent ``max_recent_turns * 2``
           messages, preceded by a system message carrying the summary
           when one is set
        """
        total_messages = len(self._messages)
        max_messages = self.max_recent_turns * 2  # user + assistant = 1 turn

        if total_messages <= max_messages:
            return list(self._messages)

        # Truncate to the recent window
        recent_messages = self._messages[-max_messages:]

        # Prepend the summary when there is one
        if self._summary:
            return [
                {"role": "system", "content": f"[Earlier conversation summary]\n{self._summary}"}
            ] + recent_messages

        return recent_messages

    def needs_summarization(self) -> bool:
        """Return True when more than ``summary_threshold * 2`` messages are unsummarised."""
        unsummarized = len(self._messages) - self._summary_up_to
        return unsummarized > self.summary_threshold * 2

    def get_messages_to_summarize(self) -> list:
        """
        Return the unsummarised messages that lie before the recent window.

        Returns an empty list when summarisation is not needed, or when
        every unsummarised message still falls inside the recent window
        (possible when ``summary_threshold`` is smaller than
        ``max_recent_turns``).
        """
        if not self.needs_summarization():
            return []

        # Keep the recent window, summarise what precedes it
        end_idx = len(self._messages) - self.max_recent_turns * 2
        if end_idx <= self._summary_up_to:
            # A zero or negative end index would otherwise be read as an
            # offset from the end and select messages inside the window.
            return []
        return self._messages[self._summary_up_to:end_idx]

    def set_summary(self, summary: str, up_to_index: int) -> None:
        """Store ``summary`` (appended to any existing summary) and record its coverage."""
        if self._summary:
            # Merge with the previous summary
            self._summary = f"{self._summary}\n\n{summary}"
        else:
            self._summary = summary
        self._summary_up_to = up_to_index

    def clear(self) -> None:
        """Drop all messages and the summary."""
        self._messages = []
        self._summary = None
        self._summary_up_to = 0

    def get_turn_count(self) -> int:
        """Return the number of turns, counting two messages per turn."""
        return len(self._messages) // 2

    def get_stats(self) -> dict:
        """Return message count, turn count and summary status."""
        return {
            "total_messages": len(self._messages),
            "turn_count": self.get_turn_count(),
            "has_summary": self._summary is not None,
            "summary_covers": self._summary_up_to,
        }
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
# === Global conversation memory store ===
# key: session_id, value: ConversationMemory
# Held in process memory only, so it is empty again after a restart
_conversation_memories: Dict[str, ConversationMemory] = {}


def get_conversation_memory(session_id: str) -> ConversationMemory:
    """Return the memory registered for ``session_id``, creating it on first access."""
    memory = _conversation_memories.get(session_id)
    if memory is None:
        memory = ConversationMemory()
        _conversation_memories[session_id] = memory
    return memory
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
def clear_conversation_memory(session_id: str) -> None:
    """Remove the memory registered for ``session_id``; unknown IDs are ignored."""
    _conversation_memories.pop(session_id, None)
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
def get_memory_stats() -> dict:
    """Return how many conversation memories exist and the session IDs they belong to."""
    session_ids = list(_conversation_memories)
    return {
        "total_memories": len(_conversation_memories),
        "sessions": session_ids,
    }
|
deploy.sh
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# ============================================================
# GitHub RAG Agent - production deployment script
# (tuned for a 2-core / 2 GB server)
# ============================================================
#
# Usage:
#   chmod +x deploy.sh
#   ./deploy.sh
#
# Prerequisites:
#   - Python 3.10+
#   - Docker (used to run Qdrant)
#
# ============================================================

# Abort on the first failing command
set -e

echo "🚀 GitHub RAG Agent 部署脚本"
echo "=========================================="

# Must be run from the project root
if [ ! -f "requirements.txt" ]; then
    echo "❌ 请在项目根目录运行此脚本"
    exit 1
fi

# A .env file is required
if [ ! -f ".env" ]; then
    echo "❌ 未找到 .env 文件,请先复制 .env.example 并配置"
    echo " cp .env.example .env"
    echo " vim .env"
    exit 1
fi

# ============================================================
# 1. Start the Qdrant server (Docker)
# ============================================================
echo ""
echo "📦 步骤 1: 启动 Qdrant Server..."

# Docker daemon must be reachable
if ! docker info > /dev/null 2>&1; then
    echo "❌ Docker 未运行,请先启动 Docker"
    exit 1
fi

# Reuse the qdrant-server container when it already exists
if docker ps -a --format '{{.Names}}' | grep -q "^qdrant-server$"; then
    echo " Qdrant 容器已存在,检查状态..."
    if docker ps --format '{{.Names}}' | grep -q "^qdrant-server$"; then
        echo " ✅ Qdrant 已在运行"
    else
        echo " 🔄 启动已有的 Qdrant 容器..."
        docker start qdrant-server
    fi
else
    echo " 🆕 创建并启动 Qdrant 容器 (内存限制 512MB)..."
    docker run -d \
        --name qdrant-server \
        --restart unless-stopped \
        -p 6333:6333 \
        -p 6334:6334 \
        -v qdrant_data:/qdrant/storage \
        -m 512m \
        -e QDRANT__STORAGE__ON_DISK_PAYLOAD=true \
        qdrant/qdrant:latest
fi

# Wait up to 30 seconds for Qdrant to answer on its HTTP port.
# curl runs without -f, so any HTTP response counts as "up".
echo " ⏳ 等待 Qdrant 就绪..."
qdrant_ready=false
for _ in {1..30}; do
    if curl -s http://localhost:6333/health > /dev/null 2>&1; then
        echo " ✅ Qdrant 已就绪"
        qdrant_ready=true
        break
    fi
    sleep 1
done

# Stop here instead of starting the app against a database that never came up
if [ "$qdrant_ready" != "true" ]; then
    echo " ❌ Qdrant 在 30 秒内未就绪,请检查: docker logs qdrant-server"
    exit 1
fi

# ============================================================
# 2. Create the Python virtual environment
# ============================================================
echo ""
echo "🐍 步骤 2: 配置 Python 环境..."

if [ ! -d "venv" ]; then
    echo " 创建虚拟环境..."
    python3 -m venv venv
fi

echo " 激活虚拟环境..."
source venv/bin/activate

echo " 安装依赖..."
pip install -q --upgrade pip
pip install -q -r requirements.txt

# ============================================================
# 3. Create required directories
# ============================================================
echo ""
echo "📁 步骤 3: 创建数据目录..."
mkdir -p data/locks
mkdir -p data/contexts
mkdir -p logs

# ============================================================
# 4. Set environment variables
# ============================================================
echo ""
echo "⚙️ 步骤 4: 配置环境变量..."

# Load .env; "set -a" exports every variable it assigns
set -a
source .env
set +a

# Server mode settings; these override whatever .env defined
export QDRANT_MODE=server
export QDRANT_URL=http://localhost:6333
export LOCK_BACKEND=file
export LOCK_DIR=data/locks
export GUNICORN_WORKERS=2

echo " QDRANT_MODE=$QDRANT_MODE"
echo " QDRANT_URL=$QDRANT_URL"
echo " GUNICORN_WORKERS=$GUNICORN_WORKERS"

# ============================================================
# 5. Start the application
# ============================================================
echo ""
echo "🌐 步骤 5: 启动 FastAPI 应用..."
echo "=========================================="
echo " Workers: 2 (优化2核CPU)"
echo " 监听地址: 0.0.0.0:8000"
echo " Qdrant: http://localhost:6333"
echo "=========================================="
echo ""
echo " 按 Ctrl+C 停止服务"
echo ""

# Start with Gunicorn; settings come from gunicorn_conf.py
gunicorn app.main:app -c gunicorn_conf.py
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Docker Compose configuration - production deployment (tuned for a 2-core / 2 GB server)
# Contains: FastAPI application + Qdrant server

version: '3.8'

services:
  # ============================================================
  # Qdrant vector database (memory limited to 512MB)
  # ============================================================
  qdrant:
    image: qdrant/qdrant:latest
    container_name: github-rag-qdrant
    restart: unless-stopped
    ports:
      - "6333:6333"  # REST API
      - "6334:6334"  # gRPC
    volumes:
      - qdrant_data:/qdrant/storage
    environment:
      - QDRANT__SERVICE__GRPC_PORT=6334
      - QDRANT__STORAGE__ON_DISK_PAYLOAD=true  # keep payloads on disk to save memory
    deploy:
      resources:
        limits:
          memory: 512M
        reservations:
          memory: 256M
    healthcheck:
      # The official qdrant image does not ship curl, and Qdrant's health
      # endpoints are /healthz, /livez and /readyz (not /health). A curl-based
      # probe therefore never succeeds, which keeps the `app` service below
      # (depends_on: service_healthy) from ever starting. Probe the REST port
      # with bash's built-in /dev/tcp instead.
      test: ["CMD-SHELL", "bash -c ':> /dev/tcp/127.0.0.1/6333' || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3

  # ============================================================
  # FastAPI application (2 workers, memory limited to 1GB)
  # ============================================================
  app:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: github-rag-app
    restart: unless-stopped
    ports:
      - "8000:8000"
    environment:
      # Qdrant server mode
      - QDRANT_MODE=server
      - QDRANT_URL=http://qdrant:6333

      # Worker count (2 recommended for a 2-core server)
      - GUNICORN_WORKERS=2

      # File-based locks (shared between workers)
      - LOCK_BACKEND=file
      - LOCK_DIR=/app/data/locks

      # LLM configuration (read from .env)
      - LLM_PROVIDER=${LLM_PROVIDER:-deepseek}
      - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY}
      - OPENAI_API_KEY=${OPENAI_API_KEY}
      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
      - GEMINI_API_KEY=${GEMINI_API_KEY}
      - SILICON_API_KEY=${SILICON_API_KEY}
      - GITHUB_TOKEN=${GITHUB_TOKEN}
    volumes:
      - app_data:/app/data
      - app_logs:/app/logs
    deploy:
      resources:
        limits:
          memory: 1G
        reservations:
          memory: 512M
    depends_on:
      qdrant:
        condition: service_healthy
    healthcheck:
      # NOTE(review): requires curl inside the app image - confirm the Dockerfile installs it.
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3

volumes:
  qdrant_data:
    driver: local
  app_data:
    driver: local
  app_logs:
    driver: local

# ============================================================
# Usage
# ============================================================
# 1. Copy .env.example to .env and fill in the API keys
# 2. Start the services: docker-compose up -d
# 3. Follow the logs:    docker-compose logs -f app
# 4. Stop the services:  docker-compose down
#
# Scaling to more workers:
#   Change the gunicorn worker count in the Dockerfile, or use:
#   docker-compose up -d --scale app=3
#   together with Nginx/Traefik for load balancing.
#   NOTE(review): --scale conflicts with the fixed container_name and the fixed
#   host port 8000 above; both must be removed before scaling the app service.
|
evaluation/__init__.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# evaluation/__init__.py
|
| 2 |
+
"""
|
| 3 |
+
Evaluation 模块
|
| 4 |
+
|
| 5 |
+
提供完整的评估框架,包括:
|
| 6 |
+
- 数据模型 (models.py)
|
| 7 |
+
- 评估引擎 (evaluation_framework.py)
|
| 8 |
+
- 数据路由 (data_router.py)
|
| 9 |
+
- 工具函数 (utils.py)
|
| 10 |
+
- 数据分析 (analyze_eval_results.py)
|
| 11 |
+
- 数据清洗 (clean_and_export_sft_data.py)
|
| 12 |
+
|
| 13 |
+
使用示例:
|
| 14 |
+
from evaluation import EvaluationEngine, DataRoutingEngine, EvaluationResult
|
| 15 |
+
from evaluation.models import GenerationMetrics
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
# 核心导出
|
| 19 |
+
from evaluation.models import (
|
| 20 |
+
EvaluationLayer,
|
| 21 |
+
DataQualityTier,
|
| 22 |
+
QueryRewriteMetrics,
|
| 23 |
+
RetrievalMetrics,
|
| 24 |
+
GenerationMetrics,
|
| 25 |
+
AgenticMetrics,
|
| 26 |
+
EvaluationResult,
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
from evaluation.data_router import DataRoutingEngine
|
| 30 |
+
from evaluation.evaluation_framework import EvaluationEngine
|
| 31 |
+
|
| 32 |
+
# 工具函数
|
| 33 |
+
from evaluation.utils import (
|
| 34 |
+
is_chatty_query,
|
| 35 |
+
has_code_indicators,
|
| 36 |
+
read_jsonl,
|
| 37 |
+
append_jsonl,
|
| 38 |
+
safe_truncate,
|
| 39 |
+
smart_truncate,
|
| 40 |
+
SFTLengthConfig,
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
__all__ = [
|
| 44 |
+
# 枚举
|
| 45 |
+
"EvaluationLayer",
|
| 46 |
+
"DataQualityTier",
|
| 47 |
+
# 数据模型
|
| 48 |
+
"QueryRewriteMetrics",
|
| 49 |
+
"RetrievalMetrics",
|
| 50 |
+
"GenerationMetrics",
|
| 51 |
+
"AgenticMetrics",
|
| 52 |
+
"EvaluationResult",
|
| 53 |
+
# 引擎
|
| 54 |
+
"EvaluationEngine",
|
| 55 |
+
"DataRoutingEngine",
|
| 56 |
+
# 工具函数
|
| 57 |
+
"is_chatty_query",
|
| 58 |
+
"has_code_indicators",
|
| 59 |
+
"read_jsonl",
|
| 60 |
+
"append_jsonl",
|
| 61 |
+
"safe_truncate",
|
| 62 |
+
"smart_truncate",
|
| 63 |
+
"SFTLengthConfig",
|
| 64 |
+
]
|
evaluation/analyze_eval_results.py
ADDED
|
@@ -0,0 +1,379 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 文件路径: evaluation/analyze_eval_results.py
|
| 2 |
+
"""
|
| 3 |
+
自动化数据分析脚本
|
| 4 |
+
用于分析评估结果,识别问题并生成诊断报告
|
| 5 |
+
|
| 6 |
+
核心功能:
|
| 7 |
+
1. 自动读取所有评估结果
|
| 8 |
+
2. 按问题类型分类 Bad Case
|
| 9 |
+
3. 生成可视化报告
|
| 10 |
+
4. 推荐优化方向
|
| 11 |
+
|
| 12 |
+
Author: Dexter
|
| 13 |
+
Date: 2025-01-27
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import os
|
| 17 |
+
from typing import Dict, List
|
| 18 |
+
from collections import Counter, defaultdict
|
| 19 |
+
from datetime import datetime
|
| 20 |
+
|
| 21 |
+
from evaluation.utils import read_jsonl
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class EvaluationAnalyzer:
    """Analyze evaluation results and produce diagnostics.

    Records are loaded with ``read_jsonl`` and kept in ``self.results`` as a
    list of dicts. The methods derive summary statistics, classify low-scoring
    records ("bad cases"), aggregate per-layer scores, generate textual
    recommendations, write a Markdown report and export bad cases as CSV.
    """

    # Layers whose ``overall_score`` is aggregated, in report order.
    _LAYERS = ("query_rewrite", "retrieval", "generation", "agentic")

    def __init__(self, eval_results_file: str = "evaluation/sft_data/eval_results.jsonl"):
        """Load all records from ``eval_results_file``; warn when none are loaded."""
        self.eval_results_file = eval_results_file
        self.results: List[Dict] = read_jsonl(eval_results_file)
        if not self.results:
            print(f"⚠️ No results loaded from: {eval_results_file}")

    @staticmethod
    def _ensure_parent_dir(path: str) -> None:
        """Create the parent directory of ``path`` if it has one."""
        parent = os.path.dirname(path)
        # dirname is "" for a bare file name; os.makedirs("") would raise.
        if parent:
            os.makedirs(parent, exist_ok=True)

    def get_basic_stats(self) -> Dict:
        """Return overall statistics, or ``{}`` when no results are loaded.

        Keys: ``total_evaluations``, ``avg_score``, ``max_score``,
        ``min_score``, ``median_score``, ``quality_distribution`` (tier ->
        count) and ``sft_ready_count``.
        """
        import statistics

        if not self.results:
            return {}

        scores = [r.get("overall_score", 0) for r in self.results]
        tiers = [r.get("data_quality_tier", "unknown") for r in self.results]

        return {
            "total_evaluations": len(self.results),
            "avg_score": sum(scores) / len(scores),
            "max_score": max(scores),
            "min_score": min(scores),
            # True median: averages the two middle values for even-sized input
            # instead of silently picking the upper one.
            "median_score": statistics.median(scores),
            "quality_distribution": dict(Counter(tiers)),
            "sft_ready_count": sum(1 for r in self.results if r.get("sft_ready", False)),
        }

    def identify_bad_cases(self, threshold: float = 0.6) -> List[Dict]:
        """Return records whose ``overall_score`` is below ``threshold``.

        Records without a score are treated as score 1 (never a bad case).
        The result is sorted by score, worst first.
        """
        bad_cases = [r for r in self.results if r.get("overall_score", 1) < threshold]
        return sorted(bad_cases, key=lambda x: x.get("overall_score", 1))

    def categorize_failures(self) -> Dict[str, List[Dict]]:
        """Group bad cases by failure reason.

        Reasons emitted: ``retrieval_failure``, ``retrieval_low_recall``,
        ``generation_hallucination``, ``generation_incomplete``,
        ``hallucination_detected``, ``agentic_failure`` and ``unknown``.
        A record may appear under several reasons.
        """
        categorized = defaultdict(list)

        for result in self.identify_bad_cases():
            reasons = []

            # Retrieval problems: a total miss takes precedence over low recall.
            retrieval = result.get("retrieval")
            if retrieval:
                if retrieval.get("hit_rate", 1) == 0:
                    reasons.append("retrieval_failure")
                elif retrieval.get("recall_at_k", 1) < 0.5:
                    reasons.append("retrieval_low_recall")

            # Generation problems: each check is independent.
            generation = result.get("generation")
            if generation:
                if generation.get("faithfulness", 1) < 0.5:
                    reasons.append("generation_hallucination")
                if generation.get("answer_completeness", 1) < 0.4:
                    reasons.append("generation_incomplete")
                if generation.get("hallucination_count", 0) > 0:
                    reasons.append("hallucination_detected")

            # Agent behaviour.
            agentic = result.get("agentic")
            if agentic:
                if not agentic.get("success", True):
                    reasons.append("agentic_failure")

            # No specific reason found.
            if not reasons:
                reasons.append("unknown")

            for reason in reasons:
                categorized[reason].append(result)

        return dict(categorized)

    def layer_performance(self) -> Dict[str, Dict]:
        """Return ``{layer: {"avg", "min", "max", "count"}}`` per evaluated layer.

        Layers that are absent or empty in a record are skipped, as are
        scores that are missing or falsy (including exactly 0).
        """
        layer_scores = defaultdict(list)

        for result in self.results:
            for layer in self._LAYERS:
                layer_data = result.get(layer)
                if not layer_data:
                    continue
                score = layer_data.get("overall_score", 0)
                if score:
                    layer_scores[layer].append(score)

        # Per-layer statistics.
        layer_stats = {}
        for layer, scores in layer_scores.items():
            if scores:
                layer_stats[layer] = {
                    "avg": sum(scores) / len(scores),
                    "min": min(scores),
                    "max": max(scores),
                    "count": len(scores)
                }

        return layer_stats

    def get_recommendations(self) -> List[str]:
        """Build optimisation suggestions from per-layer averages and SFT yield."""
        recommendations = []

        layer_perf = self.layer_performance()

        # Retrieval layer.
        if "retrieval" in layer_perf:
            retrieval_score = layer_perf["retrieval"]["avg"]
            if retrieval_score < 0.7:
                recommendations.append(
                    "🔴 RETRIEVAL 层性能差 (avg: {:.2f})\n"
                    " 建议:\n"
                    " 1. 检查 chunking 策略是否过度分割\n"
                    " 2. 优化 embedding 模型 (考虑更强的模型)\n"
                    " 3. 调整混合检索的权重 (BM25 vs Vector)\n"
                    " 4. 分析实际召回的文件,看是否与预期偏离".format(retrieval_score)
                )

        # Generation layer.
        if "generation" in layer_perf:
            gen_score = layer_perf["generation"]["avg"]
            if gen_score < 0.7:
                recommendations.append(
                    "🟡 GENERATION 层存在问题 (avg: {:.2f})\n"
                    " 建议:\n"
                    " 1. 检查 Prompt 是否清晰 (可能LLM理解偏差)\n"
                    " 2. 检查是否存在幻觉 (生成不存在的函数名等)\n"
                    " 3. 优化 Context 的组织方式\n"
                    " 4. 考虑使用更强的LLM模型".format(gen_score)
                )

        # Query rewrite layer.
        if "query_rewrite" in layer_perf:
            rewrite_score = layer_perf["query_rewrite"]["avg"]
            if rewrite_score < 0.6:
                recommendations.append(
                    "🟠 QUERY_REWRITE 层准确度低 (avg: {:.2f})\n"
                    " 建议:\n"
                    " 1. 优化关键词提取 Prompt\n"
                    " 2. 增加多语言处理支持\n"
                    " 3. 添加领域词汇表 (Domain Vocabulary)".format(rewrite_score)
                )

        # General advice: fewer than half of the records are SFT-ready.
        stats = self.get_basic_stats()
        if stats.get("sft_ready_count", 0) / max(stats.get("total_evaluations", 1), 1) < 0.5:
            recommendations.append(
                "⚠️ SFT 可用数据不足 (< 50%)\n"
                " 立即行动:\n"
                " 1. 运行 continuous_eval 脚本收集更多数据\n"
                " 2. 对现有数据进行自纠正 (Self-Correction)\n"
                " 3. 扩展黄金数据集来改进模型"
            )

        return recommendations

    def generate_report(self, output_file: str = "evaluation/analysis_report.md") -> str:
        """Write the full Markdown report to ``output_file`` and return its text.

        The parent directory is created when ``output_file`` has one; a bare
        file name is written to the current working directory.
        """
        report = []
        report.append("# 📊 GitHub Agent 评估分析报告\n")
        report.append(f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        report.append("---\n")

        # 1. Basic statistics
        stats = self.get_basic_stats()
        report.append("## 📈 基本统计\n")
        report.append(f"- 总评估次数: {stats.get('total_evaluations', 0)}\n")
        report.append(f"- 平均得分: {stats.get('avg_score', 0):.3f}\n")
        report.append(f"- 最高得分: {stats.get('max_score', 0):.3f}\n")
        report.append(f"- 最低得分: {stats.get('min_score', 0):.3f}\n")
        report.append(f"- 中位数得分: {stats.get('median_score', 0):.3f}\n")
        report.append(f"- SFT 可用样本: {stats.get('sft_ready_count', 0)}\n\n")

        # 2. Quality tier distribution
        report.append("## 🏆 质量分级分布\n")
        distribution = stats.get("quality_distribution", {})
        for tier, count in sorted(distribution.items()):
            percentage = (count / stats.get('total_evaluations', 1)) * 100
            report.append(f"- {tier.upper()}: {count} ({percentage:.1f}%)\n")
        report.append("\n")

        # 3. Per-layer performance
        report.append("## 🎯 各层性能分析\n\n")
        layer_perf = self.layer_performance()
        for layer in ["query_rewrite", "retrieval", "generation", "agentic"]:
            if layer in layer_perf:
                perf = layer_perf[layer]
                report.append(f"### {layer.upper()}\n")
                report.append(f"- 平均得分: {perf['avg']:.3f}\n")
                report.append(f"- 范围: [{perf['min']:.3f}, {perf['max']:.3f}]\n")
                report.append(f"- 样本数: {perf['count']}\n\n")

        # 4. Bad cases, most frequent reason first
        report.append("## 🔴 Bad Case 分析\n\n")
        failures = self.categorize_failures()
        for reason, cases in sorted(failures.items(), key=lambda x: -len(x[1])):
            report.append(f"### {reason} ({len(cases)} cases)\n")
            for case in cases[:3]:  # show the top 3 per reason
                report.append(f"- 查询: {case.get('query', 'N/A')[:60]}...\n")
                report.append(f" 得分: {case.get('overall_score', 0):.3f}\n")
            report.append("\n")

        # 5. Recommended actions
        report.append("## 💡 优化建议\n\n")
        recommendations = self.get_recommendations()
        for i, rec in enumerate(recommendations, 1):
            report.append(f"{i}. {rec}\n\n")

        # Write to disk
        self._ensure_parent_dir(output_file)
        with open(output_file, 'w', encoding='utf-8') as f:
            f.writelines(report)

        return "".join(report)

    def export_bad_cases_csv(self, output_file: str = "evaluation/bad_cases.csv") -> None:
        """Export bad cases to ``output_file`` as CSV for manual review.

        Layers that are missing or ``None`` in a record contribute a score
        of 0. The parent directory is created when needed.
        """
        import csv

        bad_cases = self.identify_bad_cases()

        def layer_score(case: Dict, layer: str):
            # A layer may be present but None (JSON null); treat it as empty.
            return (case.get(layer) or {}).get("overall_score", 0)

        self._ensure_parent_dir(output_file)
        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=[
                "query", "overall_score", "tier",
                "retrieval_score", "generation_score", "agentic_score",
                "error_message", "timestamp"
            ])

            writer.writeheader()
            for case in bad_cases:
                writer.writerow({
                    "query": case.get("query", ""),
                    "overall_score": case.get("overall_score", 0),
                    "tier": case.get("data_quality_tier", "unknown"),
                    "retrieval_score": layer_score(case, "retrieval"),
                    "generation_score": layer_score(case, "generation"),
                    "agentic_score": layer_score(case, "agentic"),
                    "error_message": case.get("error_message", ""),
                    "timestamp": case.get("timestamp", "")
                })

        print(f"✅ Exported {len(bad_cases)} bad cases to {output_file}")
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
# ============================================================================
|
| 293 |
+
# 命令行工具
|
| 294 |
+
# ============================================================================
|
| 295 |
+
|
| 296 |
+
def print_summary(analyzer: EvaluationAnalyzer):
    """Print a short console summary: stats, layer averages, worst cases, advice."""
    divider = "=" * 70

    print("\n" + divider)
    print("📊 评估结果摘要")
    print(divider)

    # Overall statistics
    stats = analyzer.get_basic_stats()
    total = stats.get('total_evaluations', 0)
    average = stats.get('avg_score', 0)
    distribution = stats.get('quality_distribution', {})
    sft_ready = stats.get('sft_ready_count', 0)

    print("\n📈 基本统计:")
    print(f" 总评估: {total}")
    print(f" 平均分: {average:.3f}")
    print(f" 分布: {distribution}")
    print(f" SFT可用: {sft_ready}")

    # Average score per layer, dot-padded to a fixed column
    print("\n🎯 各层性能:")
    for layer_name, layer_stats in analyzer.layer_performance().items():
        print(f" {layer_name:.<30} {layer_stats['avg']:.3f} (avg)")

    # Five worst-scoring queries
    print("\n🔴 Bad Case Top 5:")
    worst = analyzer.identify_bad_cases()[:5]
    for rank, bad in enumerate(worst, 1):
        query_text = bad.get('query', 'N/A')[:40]
        score = bad.get('overall_score', 0)
        print(f" {rank}. {query_text:<40} Score: {score:.3f}")

    # Only the headline (first line) of the first three recommendations
    print("\n💡 优化建议:")
    for advice in analyzer.get_recommendations()[:3]:
        headline = advice.split("\n")[0]
        print(f" - {headline}")

    print("\n" + divider)
|
| 326 |
+
|
| 327 |
+
|
| 328 |
+
def main():
    """Command-line entry point: run the sub-command given as the first argument.

    Supported sub-commands: ``summary``, ``report``, ``bad-cases``,
    ``layer-perf`` and ``recommendations``. Without an argument the usage
    text is printed. The analyzer is always constructed first.
    """
    import sys

    analyzer = EvaluationAnalyzer()
    cli_args = sys.argv[1:]

    # No sub-command: show usage and stop.
    if not cli_args:
        print("自动化评估数据分析工具")
        print()
        print("用法:")
        print(" python analyze_eval_results.py summary # 快速摘要")
        print(" python analyze_eval_results.py report # 生成完整报告")
        print(" python analyze_eval_results.py bad-cases # 导出Bad Case")
        print(" python analyze_eval_results.py layer-perf # 各层性能分析")
        print(" python analyze_eval_results.py recommendations # 优化建议")
        return

    command = cli_args[0]

    if command == "summary":
        print_summary(analyzer)
        return

    if command == "report":
        print(analyzer.generate_report())
        return

    if command == "bad-cases":
        analyzer.export_bad_cases_csv()
        found = analyzer.identify_bad_cases()
        print(f"\n✅ Found {len(found)} bad cases")
        print("详见 evaluation/bad_cases.csv")
        return

    if command == "layer-perf":
        per_layer = analyzer.layer_performance()
        print("\n🎯 各层性能:")
        for layer_name, layer_stats in per_layer.items():
            print(f"\n{layer_name.upper()}:")
            print(f" Average: {layer_stats['avg']:.3f}")
            print(f" Range: [{layer_stats['min']:.3f}, {layer_stats['max']:.3f}]")
            print(f" Samples: {layer_stats['count']}")
        return

    if command == "recommendations":
        advice_list = analyzer.get_recommendations()
        print("\n💡 优化建议:\n")
        for index, advice in enumerate(advice_list, 1):
            print(f"{index}.\n{advice}\n")
        return

    print(f"Unknown command: {command}")
|
| 376 |
+
|
| 377 |
+
|
| 378 |
+
if __name__ == "__main__":
|
| 379 |
+
main()
|
evaluation/clean_and_export_sft_data.py
ADDED
|
@@ -0,0 +1,369 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
SFT 数据清洗与导出脚本
|
| 4 |
+
|
| 5 |
+
功能:
|
| 6 |
+
1. 从 eval_results.jsonl 读取原始评估数据
|
| 7 |
+
2. 应用严格的质量过滤规则
|
| 8 |
+
3. 转换为标准 SFT 训练格式
|
| 9 |
+
4. 导出为可直接用于训练的数据集
|
| 10 |
+
|
| 11 |
+
Author: Dexter
|
| 12 |
+
Date: 2026-01-28
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import json
|
| 16 |
+
import os
|
| 17 |
+
from datetime import datetime
|
| 18 |
+
from typing import Dict, List, Tuple
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
|
| 21 |
+
from evaluation.utils import is_chatty_query, has_code_indicators
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# ============================================================================
|
| 25 |
+
# 配置
|
| 26 |
+
# ============================================================================
|
| 27 |
+
|
| 28 |
+
class CleaningConfig:
    """Thresholds and switches that control SFT data cleaning."""

    # --- Quality thresholds (samples scoring below these are rejected) ---
    MIN_OVERALL_SCORE = 0.7      # minimum overall score
    MIN_FAITHFULNESS = 0.6       # minimum faithfulness
    MIN_ANSWER_RELEVANCE = 0.6   # minimum answer relevance

    # --- Length thresholds, in characters ---
    MIN_QUERY_LENGTH = 10        # shortest accepted query
    MIN_ANSWER_LENGTH = 100      # shortest accepted answer
    MIN_CONTEXT_LENGTH = 50      # shortest accepted context
    MAX_CONTEXT_LENGTH = 4000    # longer contexts are truncated to this length

    # --- Mandatory conditions ---
    REQUIRE_REPO_URL = True          # sample must carry a repository URL
    REQUIRE_CODE_IN_CONTEXT = True   # retrieved context must contain code

    # --- Output ---
    OUTPUT_DIR = "evaluation/sft_data/cleaned"
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# ============================================================================
|
| 50 |
+
# 数据清洗逻辑
|
| 51 |
+
# ============================================================================
|
| 52 |
+
|
| 53 |
+
def validate_sample(sample: Dict, config: CleaningConfig) -> Tuple[bool, str]:
    """Check whether one evaluation record meets the SFT quality criteria.

    Checks run in a fixed order and the first failure wins: required fields,
    repository URL, quality scores, lengths, chatty-query filter, code
    presence in the context.

    Values that are present but ``None`` (JSON ``null``) are treated the same
    as missing values, so such records are rejected with a regular reason
    code instead of raising ``TypeError``.

    Args:
        sample: Raw evaluation record with ``query``, ``generation`` and
            score fields.
        config: Object providing the ``MIN_*`` / ``REQUIRE_*`` thresholds.

    Returns:
        ``(is_valid, rejection_reason)``; the reason is ``"passed"`` for a
        valid sample.
    """
    # 1. Required fields
    query = sample.get("query")
    if not query:
        return False, "missing_query"

    gen = sample.get("generation")
    if not gen:
        return False, "missing_generation"

    # 2. Repository URL
    if config.REQUIRE_REPO_URL and not sample.get("repo_url"):
        return False, "missing_repo_url"

    # 3. Quality scores (`or 0` maps an explicit null to the default)
    overall_score = sample.get("overall_score") or 0
    if overall_score < config.MIN_OVERALL_SCORE:
        return False, f"low_score:{overall_score:.2f}"

    faithfulness = gen.get("faithfulness") or 0
    if faithfulness < config.MIN_FAITHFULNESS:
        return False, f"low_faithfulness:{faithfulness:.2f}"

    answer_relevance = gen.get("answer_relevance") or 0
    if answer_relevance < config.MIN_ANSWER_RELEVANCE:
        return False, f"low_relevance:{answer_relevance:.2f}"

    # 4. Lengths
    if len(query) < config.MIN_QUERY_LENGTH:
        return False, f"short_query:{len(query)}"

    answer = gen.get("generated_answer") or ""
    if len(answer) < config.MIN_ANSWER_LENGTH:
        return False, f"short_answer:{len(answer)}"

    context = gen.get("retrieved_context") or ""
    if len(context) < config.MIN_CONTEXT_LENGTH:
        return False, f"short_context:{len(context)}"

    # 5. Small-talk filter
    if is_chatty_query(query):
        return False, "chatty_query"

    # 6. Context must contain code
    if config.REQUIRE_CODE_IN_CONTEXT and not has_code_indicators(context):
        return False, "no_code_in_context"

    return True, "passed"
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def transform_to_sft_format(sample: Dict, config: CleaningConfig) -> Dict:
    """Convert a raw evaluation record into an instruction/input/output SFT sample.

    The retrieved context is cut to ``config.MAX_CONTEXT_LENGTH`` characters
    (with a truncation marker appended) before it is embedded in ``input``.
    Scores and provenance fields are copied into ``metadata``.
    """
    generation = sample["generation"]
    user_query = sample["query"]

    # Clip over-long context and mark the cut.
    code_context = generation.get("retrieved_context", "")
    limit = config.MAX_CONTEXT_LENGTH
    if len(code_context) > limit:
        code_context = code_context[:limit] + "\n... [truncated]"

    # Provenance and quality scores kept alongside the training fields.
    metadata = {
        "query": user_query,
        "repo_url": sample.get("repo_url", ""),
        "language": sample.get("language", "en"),
        "session_id": sample.get("session_id", ""),
        "timestamp": sample.get("timestamp", ""),
        "quality_tier": sample.get("data_quality_tier", ""),
        "overall_score": sample.get("overall_score", 0),
        "faithfulness": generation.get("faithfulness", 0),
        "answer_relevance": generation.get("answer_relevance", 0),
        "answer_completeness": generation.get("answer_completeness", 0),
        "code_correctness": generation.get("code_correctness", 0),
    }

    # Core training fields first, metadata last.
    return {
        "instruction": "你是一个专业的GitHub代码仓库分析助手。根据提供的代码上下文,准确回答用户关于代码实现、架构设计、功能逻辑等问题。回答时应该:1) 直接引用相关代码 2) 解释代码的工作原理 3) 如有必要,提供代码示例。",
        "input": f"[用户问题]\n{user_query}\n\n[代码上下文]\n{code_context}",
        "output": generation.get("generated_answer", ""),
        "metadata": metadata,
    }
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def clean_and_export(
    input_file: str = "evaluation/sft_data/eval_results.jsonl",
    config: "CleaningConfig" = None
) -> Dict:
    """
    Validate every record of an evaluation JSONL file and export the survivors.

    Records accepted by ``validate_sample`` are converted with
    ``transform_to_sft_format`` and written to ``sft_train_<timestamp>.jsonl``;
    rejected records are summarised in ``rejected_<timestamp>.jsonl``. Both
    files live in ``config.OUTPUT_DIR`` and are only created when they would
    be non-empty. A report is printed to stdout.

    Args:
        input_file: Path of the JSONL file to clean.
        config: Cleaning configuration; a default ``CleaningConfig()`` is
            created when omitted.

    Returns:
        Statistics dict with ``total_read``, ``passed``, ``rejected``,
        ``rejection_reasons`` and ``quality_distribution``.
    """
    config = config or CleaningConfig()

    # Create the output directory up front.
    output_dir = Path(config.OUTPUT_DIR)
    output_dir.mkdir(parents=True, exist_ok=True)

    stats = {
        "total_read": 0,
        "passed": 0,
        "rejected": 0,
        "rejection_reasons": {},
        "quality_distribution": {"gold": 0, "silver": 0, "bronze": 0}
    }

    # Take the timestamp once so both files of a run share the same suffix
    # (two separate now() calls can straddle a second boundary).
    run_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = output_dir / f"sft_train_{run_stamp}.jsonl"
    rejected_file = output_dir / f"rejected_{run_stamp}.jsonl"

    print("=" * 60)
    print("🧹 SFT 数据清洗与导出")
    print("=" * 60)
    print(f"输入文件: {input_file}")
    print(f"输出目录: {output_dir}")
    print(f"质量阈值: score >= {config.MIN_OVERALL_SCORE}")
    print()

    if not os.path.exists(input_file):
        print(f"❌ 输入文件不存在: {input_file}")
        return stats

    passed_samples = []
    rejected_samples = []

    # Read and classify every record.
    with open(input_file, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            # Blank lines (e.g. a trailing newline) are not records; skip
            # them instead of reporting a bogus JSON error.
            if not line.strip():
                continue

            try:
                sample = json.loads(line)
            except json.JSONDecodeError as e:
                print(f" ⚠️ 第 {line_num} 行 JSON 解析错误: {e}")
                continue

            stats["total_read"] += 1
            is_valid, reason = validate_sample(sample, config)

            if is_valid:
                passed_samples.append(transform_to_sft_format(sample, config))
                stats["passed"] += 1

                # Bucket the accepted sample by its overall score.
                score = sample.get("overall_score", 0)
                if score > 0.9:
                    tier = "gold"
                elif score > 0.7:
                    tier = "silver"
                else:
                    tier = "bronze"
                stats["quality_distribution"][tier] += 1
            else:
                rejected_samples.append({
                    "reason": reason,
                    # A null query must not break the preview slice.
                    "query": (sample.get("query") or "")[:50],
                    "score": sample.get("overall_score", 0)
                })
                stats["rejected"] += 1
                stats["rejection_reasons"][reason] = stats["rejection_reasons"].get(reason, 0) + 1

    # Write accepted samples.
    if passed_samples:
        with open(output_file, 'w', encoding='utf-8') as f:
            for sample in passed_samples:
                f.write(json.dumps(sample, ensure_ascii=False) + '\n')
        print(f"✅ 已导出 {len(passed_samples)} 条高质量样本到: {output_file}")

    # Write rejected samples (kept for later analysis).
    if rejected_samples:
        with open(rejected_file, 'w', encoding='utf-8') as f:
            for sample in rejected_samples:
                f.write(json.dumps(sample, ensure_ascii=False) + '\n')
        print(f"📝 已记录 {len(rejected_samples)} 条被拒绝样本到: {rejected_file}")

    # Print the summary report; max(..., 1) avoids dividing by zero.
    print()
    print("=" * 60)
    print("📊 统计报告")
    print("=" * 60)
    print(f"总读取: {stats['total_read']}")
    print(f"通过: {stats['passed']} ({stats['passed']/max(stats['total_read'],1)*100:.1f}%)")
    print(f"拒绝: {stats['rejected']} ({stats['rejected']/max(stats['total_read'],1)*100:.1f}%)")
    print()
    print("质量分布:")
    print(f" 🥇 Gold (>0.9): {stats['quality_distribution']['gold']}")
    print(f" 🥈 Silver (>0.7): {stats['quality_distribution']['silver']}")
    print(f" 🥉 Bronze (>0.5): {stats['quality_distribution']['bronze']}")
    print()

    if stats["rejection_reasons"]:
        print("拒绝原因分布:")
        # Most frequent rejection reason first.
        for reason, count in sorted(stats["rejection_reasons"].items(), key=lambda x: -x[1]):
            print(f" - {reason}: {count}")

    print()
    print("=" * 60)

    return stats
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
def _format_training_sample(sample: dict, format_type: str) -> dict:
    """Map one cleaned SFT record onto the layout named by ``format_type``."""
    if format_type == "alpaca":
        # Alpaca layout (used by LLaMA-Factory and similar trainers).
        return {
            "instruction": sample["instruction"],
            "input": sample["input"],
            "output": sample["output"]
        }

    if format_type == "sharegpt":
        # ShareGPT conversation layout.
        return {
            "conversations": [
                {"from": "system", "value": sample["instruction"]},
                {"from": "human", "value": sample["input"]},
                {"from": "gpt", "value": sample["output"]}
            ]
        }

    if format_type == "messages":
        # OpenAI chat "messages" layout.
        return {
            "messages": [
                {"role": "system", "content": sample["instruction"]},
                {"role": "user", "content": sample["input"]},
                {"role": "assistant", "content": sample["output"]}
            ]
        }

    # Unknown format names pass the record through unchanged.
    return sample


def export_for_training(
    input_file: str,
    output_file: str,
    format_type: str = "alpaca"
) -> int:
    """
    Export a cleaned JSONL file in a specific training format.

    Args:
        input_file: Cleaned JSONL file (one SFT record per line).
        output_file: Destination path. A name ending in ``.json`` is written
            as a single indented JSON array; anything else as JSONL.
        format_type: ``"alpaca"``, ``"sharegpt"`` or ``"messages"``; any
            other value writes the records unchanged.

    Returns:
        Number of exported samples.

    Raises:
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
        KeyError: If a record lacks ``instruction`` / ``input`` / ``output``
            for one of the three known formats.
    """
    samples = []

    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            # A blank or whitespace-only line is not a record; json.loads
            # would raise on it and abort the whole export.
            if not line.strip():
                continue
            samples.append(_format_training_sample(json.loads(line), format_type))

    # str() lets callers pass a pathlib.Path as well as a plain string.
    as_json_array = str(output_file).endswith('.json')

    with open(output_file, 'w', encoding='utf-8') as f:
        if as_json_array:
            json.dump(samples, f, ensure_ascii=False, indent=2)
        else:
            for sample in samples:
                f.write(json.dumps(sample, ensure_ascii=False) + '\n')

    print(f"✅ 已导出 {len(samples)} 条样本为 {format_type} 格式: {output_file}")
    return len(samples)
|
| 333 |
+
|
| 334 |
+
|
| 335 |
+
# ============================================================================
|
| 336 |
+
# 主函数
|
| 337 |
+
# ============================================================================
|
| 338 |
+
|
| 339 |
+
if __name__ == "__main__":
    import argparse

    # Command-line interface: input path, score threshold, optional export.
    cli = argparse.ArgumentParser(description="SFT 数据清洗与导出工具")
    cli.add_argument(
        "--input", "-i",
        default="evaluation/sft_data/eval_results.jsonl",
        help="输入文件路径",
    )
    cli.add_argument(
        "--min-score", "-s",
        type=float,
        default=0.7,
        help="最低质量分数 (默认: 0.7)",
    )
    cli.add_argument(
        "--format", "-f",
        choices=["alpaca", "sharegpt", "messages"],
        default="alpaca",
        help="导出格式 (默认: alpaca)",
    )
    cli.add_argument(
        "--export", "-e",
        action="store_true",
        help="同时导出为训练格式",
    )
    options = cli.parse_args()

    # Default configuration with the score threshold taken from the CLI.
    cleaning_config = CleaningConfig()
    cleaning_config.MIN_OVERALL_SCORE = options.min_score

    # Step 1: clean the raw evaluation results.
    run_stats = clean_and_export(options.input, cleaning_config)

    # Step 2 (optional): convert the newest cleaned file to a training format.
    if options.export and run_stats["passed"] > 0:
        target_dir = Path(cleaning_config.OUTPUT_DIR)
        # File names embed a sortable timestamp, so the greatest path is the
        # most recent cleaned file.
        newest = max(target_dir.glob("sft_train_*.jsonl"), default=None)
        if newest is not None:
            destination = target_dir / f"train_{options.format}.jsonl"
            export_for_training(str(newest), str(destination), options.format)
|
evaluation/data_router.py
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 文件路径: evaluation/data_router.py
|
| 2 |
+
"""
|
| 3 |
+
数据路由引擎 - 负责 SFT 数据管理和路由
|
| 4 |
+
|
| 5 |
+
根据评估结果将样本路由到不同的数据集
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
import os
|
| 10 |
+
from typing import Dict, List, Any
|
| 11 |
+
|
| 12 |
+
from evaluation.models import EvaluationResult, DataQualityTier
|
| 13 |
+
from evaluation.utils import smart_truncate, SFTLengthConfig
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class DataRoutingEngine:
    """Evaluation-driven router that files scored samples into JSONL datasets."""

    # Instruction text stored on every SFT sample.
    SFT_INSTRUCTION = (
        "你是一个专业的GitHub代码仓库分析助手。根据提供的代码上下文,"
        "准确回答用户关于代码实现、架构设计、功能逻辑等问题。"
        "回答时应该:1) 直接引用相关代码 2) 解释代码的工作原理 3) 如有必要,提供代码示例。"
    )

    def __init__(self, output_dir: str = "evaluation/sft_data"):
        """Create ``output_dir`` if needed and derive the four JSONL paths."""
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

        self.positive_samples_file = os.path.join(output_dir, "positive_samples.jsonl")
        self.negative_samples_file = os.path.join(output_dir, "negative_samples.jsonl")
        self.dpo_pairs_file = os.path.join(output_dir, "dpo_pairs.jsonl")
        self.eval_results_file = os.path.join(output_dir, "eval_results.jsonl")

    def route_sample(self, eval_result: "EvaluationResult") -> str:
        """Route a single sample and return its data-quality tier value."""
        # A score of exactly 0.0 is treated as "not computed yet".
        if eval_result.overall_score == 0.0:
            eval_result.compute_overall_score()

        self.route_data(eval_result)
        return eval_result.data_quality_tier.value

    def route_data(self, eval_result: "EvaluationResult") -> None:
        """
        Record the evaluation result and route it by overall score.

        Routing rules:
            - score > 0.9  -> Gold   -> positive_samples.jsonl
            - score > 0.6  -> Silver -> positive_samples.jsonl
            - score > 0.4  -> Bronze -> negative_samples.jsonl
            - score <= 0.4 -> written to no SFT file (the original note says
              such samples are already rejected upstream in auto_eval)

        Every result is appended to eval_results.jsonl as an audit log,
        whatever its score. A result without generation metrics yields no
        SFT sample and is therefore only recorded in the audit log.
        """
        # Full audit log first, independent of the routing decision.
        self._append_jsonl(self.eval_results_file, eval_result.to_dict())

        score = eval_result.overall_score
        if score > 0.6:
            # Gold (> 0.9) and Silver (> 0.6) are both positive samples.
            target_file, negative = self.positive_samples_file, False
        elif score > 0.4:
            # Bronze: negative sample, usable for DPO or manual correction.
            target_file, negative = self.negative_samples_file, True
        else:
            return

        sft_sample = self._build_sft_sample(eval_result, negative=negative)
        # _build_sft_sample returns {} when generation metrics are missing;
        # writing that would put empty records into the training data.
        if sft_sample:
            self._append_jsonl(target_file, sft_sample)

    def _build_sft_sample(self, eval_result: "EvaluationResult", negative: bool = False) -> Dict:
        """
        Build one SFT training sample from an evaluation result.

        Query, context and answer are shortened according to the limits on
        ``SFTLengthConfig``. (The original note cites about 2500 characters
        of context and 3000 of answer; verify against ``evaluation/utils.py``.)

        Returns:
            The sample dict, or ``{}`` when ``generation_metrics`` is None.
        """
        if eval_result.generation_metrics is None:
            return {}

        cfg = SFTLengthConfig
        metrics = eval_result.generation_metrics

        # 1. Truncate the query.
        query = eval_result.query
        if len(query) > cfg.MAX_QUERY_CHARS:
            query = query[:cfg.MAX_QUERY_CHARS] + "..."

        # 2. Truncate the context; per the original note smart_truncate keeps
        #    the head (70%) and the tail (30%) — TODO confirm in utils.
        context = smart_truncate(metrics.retrieved_context, cfg.MAX_CONTEXT_CHARS, keep_ratio=0.7)

        # 3. Truncate the answer, keeping its beginning.
        answer = metrics.generated_answer
        if len(answer) > cfg.MAX_ANSWER_CHARS:
            answer = answer[:cfg.MAX_ANSWER_CHARS] + "\n\n... [回答过长,已截断]"

        # 4. Assemble the input and check the overall length.
        input_text = f"[用户问题]\n{query}\n\n[代码上下文]\n{context}"

        # Still too long: shrink the context by the excess, but never ask
        # for fewer than 500 characters.
        total_len = len(self.SFT_INSTRUCTION) + len(input_text) + len(answer)
        if total_len > cfg.MAX_TOTAL_CHARS:
            excess = total_len - cfg.MAX_TOTAL_CHARS
            new_context_len = max(500, len(context) - excess)
            context = smart_truncate(
                metrics.retrieved_context,
                new_context_len,
                keep_ratio=0.7
            )
            input_text = f"[用户问题]\n{query}\n\n[代码上下文]\n{context}"

        return {
            "instruction": self.SFT_INSTRUCTION,
            "input": input_text,
            "output": answer,
            "metadata": {
                "query": eval_result.query[:200],  # shortened here too, to save space
                "repo_url": eval_result.repo_url,
                "language": eval_result.language,
                "session_id": eval_result.session_id,
                "timestamp": eval_result.timestamp.isoformat(),
                "quality_tier": eval_result.data_quality_tier.value,
                "overall_score": eval_result.overall_score,
                "faithfulness": metrics.faithfulness,
                "answer_relevance": metrics.answer_relevance,
                "answer_completeness": metrics.answer_completeness,
                "code_correctness": metrics.code_correctness,
                "is_negative": negative,
                "sft_ready": eval_result.sft_ready,
                # Original lengths are kept for later analysis.
                "original_context_len": len(metrics.retrieved_context),
                "original_answer_len": len(metrics.generated_answer),
                "truncated": len(metrics.retrieved_context) > cfg.MAX_CONTEXT_CHARS
                or len(metrics.generated_answer) > cfg.MAX_ANSWER_CHARS,
            }
        }

    def _append_jsonl(self, filepath: str, data: Dict) -> None:
        """Append ``data`` as one JSON line to ``filepath``."""
        with open(filepath, 'a', encoding='utf-8') as f:
            f.write(json.dumps(data, ensure_ascii=False) + '\n')

    def get_statistics(self) -> Dict[str, int]:
        """Return the line count of each SFT data file (0 when it is missing)."""
        stats = {}
        for name, filepath in [
            ("positive", self.positive_samples_file),
            ("negative", self.negative_samples_file),
            ("dpo_pairs", self.dpo_pairs_file),
        ]:
            if os.path.exists(filepath):
                with open(filepath, 'r', encoding='utf-8') as f:
                    stats[name] = sum(1 for _ in f)
            else:
                stats[name] = 0
        return stats

    def get_distribution(self) -> Dict[str, int]:
        """Count the audit-log records per quality tier."""
        distribution = {"gold": 0, "silver": 0, "bronze": 0, "rejected": 0, "corrected": 0}

        if not os.path.exists(self.eval_results_file):
            return distribution

        try:
            with open(self.eval_results_file, 'r', encoding='utf-8') as f:
                for line in f:
                    try:
                        result = json.loads(line)
                        # Records without a tier are counted as bronze.
                        tier = result.get("data_quality_tier", "bronze")
                        if tier in distribution:
                            distribution[tier] += 1
                    except json.JSONDecodeError:
                        # Skip malformed lines rather than fail the report.
                        continue
        except Exception as e:
            print(f"⚠️ Error reading eval results: {e}")

        return distribution

    def get_bad_samples(self, limit: int = 10) -> List[Dict[str, Any]]:
        """
        Return the ``limit`` lowest-scoring samples for manual review.

        Only audit-log records with ``overall_score`` below 0.5 qualify. The
        whole log is scanned so that the result really holds the worst
        samples, sorted by ascending score.
        """
        bad_samples: List[Dict[str, Any]] = []

        if limit <= 0 or not os.path.exists(self.eval_results_file):
            return bad_samples

        try:
            with open(self.eval_results_file, 'r', encoding='utf-8') as f:
                for line in f:
                    try:
                        result = json.loads(line)
                    except json.JSONDecodeError:
                        continue
                    if not isinstance(result, dict):
                        # A valid JSON line that is not an object is no record.
                        continue

                    # A null score counts as 0 instead of aborting the scan.
                    score = result.get("overall_score") or 0
                    if score >= 0.5:
                        continue

                    sample = {
                        "query": result.get("query", ""),
                        "score": score,
                        "issue": result.get("error_message", "Low quality"),
                        "quality_tier": result.get("data_quality_tier", "rejected"),
                        "timestamp": result.get("timestamp", "")
                    }
                    gen = result.get("generation")
                    if gen:
                        sample.update({
                            "faithfulness": gen.get("faithfulness", 0),
                            "answer_relevance": gen.get("answer_relevance", 0),
                            "answer_completeness": gen.get("answer_completeness", 0),
                        })
                    bad_samples.append(sample)
        except Exception as e:
            print(f"⚠️ Error reading bad samples: {e}")

        return sorted(bad_samples, key=lambda x: x["score"])[:limit]
|
evaluation/evaluation_framework.py
ADDED
|
@@ -0,0 +1,512 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 文件路径: evaluation/evaluation_framework.py
|
| 2 |
+
"""
|
| 3 |
+
GitHub Agent 完整评估框架
|
| 4 |
+
四层评估架构 + 数据路由引擎
|
| 5 |
+
|
| 6 |
+
Author: Dexter
|
| 7 |
+
Date: 2025-01-27
|
| 8 |
+
|
| 9 |
+
注意: 数据模型已拆分到 models.py,数据路由已拆分到 data_router.py
|
| 10 |
+
此文件保留核心评估引擎逻辑,并重新导出所有符号保持向后兼容
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import json
|
| 14 |
+
import os
|
| 15 |
+
import re
|
| 16 |
+
from typing import List, Dict, Any
|
| 17 |
+
from datetime import datetime
|
| 18 |
+
|
| 19 |
+
# 重新导出所有模型(保持向后兼容)
|
| 20 |
+
from evaluation.models import (
|
| 21 |
+
EvaluationLayer,
|
| 22 |
+
DataQualityTier,
|
| 23 |
+
QueryRewriteMetrics,
|
| 24 |
+
RetrievalMetrics,
|
| 25 |
+
GenerationMetrics,
|
| 26 |
+
AgenticMetrics,
|
| 27 |
+
EvaluationResult,
|
| 28 |
+
)
|
| 29 |
+
from evaluation.data_router import DataRoutingEngine
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# ============================================================================
|
| 33 |
+
# 评估引擎核心逻辑
|
| 34 |
+
# ============================================================================
|
| 35 |
+
|
| 36 |
+
class EvaluationEngine:
|
| 37 |
+
"""评估引擎 - 负责多层面打分"""
|
| 38 |
+
|
| 39 |
+
def __init__(
|
| 40 |
+
self,
|
| 41 |
+
llm_client=None,
|
| 42 |
+
golden_dataset_path: str = "evaluation/golden_dataset.json",
|
| 43 |
+
model_name: str = None
|
| 44 |
+
):
|
| 45 |
+
self.llm_client = llm_client
|
| 46 |
+
self.model_name = model_name or "gpt-4o-mini" # 默认使用轻量模型
|
| 47 |
+
self.golden_dataset = self._load_golden_dataset(golden_dataset_path)
|
| 48 |
+
|
| 49 |
+
def _load_golden_dataset(self, path: str) -> List[Dict]:
|
| 50 |
+
"""加载黄金数据集"""
|
| 51 |
+
if not os.path.exists(path):
|
| 52 |
+
print(f"⚠️ Golden dataset not found at {path}")
|
| 53 |
+
return []
|
| 54 |
+
|
| 55 |
+
with open(path, 'r', encoding='utf-8') as f:
|
| 56 |
+
return json.load(f)
|
| 57 |
+
|
| 58 |
+
async def evaluate_query_rewrite(
    self,
    original_query: str,
    rewritten_query: str,
    language_detected: str
) -> QueryRewriteMetrics:
    """
    Score a query rewrite with simple whitespace-token heuristics.

    Metrics:
        - keyword_coverage: share of the original query's lower-cased tokens
          that also appear in the rewritten query (0.0 for an empty original).
        - semantic_preservation: coverage plus a 0.2 base score, capped at 1.0.
        - diversity_score: number of distinct rewritten tokens relative to the
          number of distinct original tokens, capped at 1.0.
    """
    # Simplified approach: plain keyword matching on whitespace tokens.
    source_terms = set(original_query.lower().split())
    rewritten_terms = set(rewritten_query.lower().split())

    # Coverage: how many original keywords survived the rewrite.
    kept_terms = source_terms & rewritten_terms
    coverage = len(kept_terms) / len(source_terms) if source_terms else 0.0

    # Diversity: more distinct rewritten keywords give a higher score.
    diversity = min(1.0, len(rewritten_terms) / max(len(source_terms), 1))

    # Semantic preservation: assumed to follow coverage, plus a base score.
    semantic_preservation = min(1.0, coverage + 0.2)

    return QueryRewriteMetrics(
        original_query=original_query,
        rewritten_query=rewritten_query,
        language_detected=language_detected,
        keyword_coverage=coverage,
        semantic_preservation=semantic_preservation,
        diversity_score=diversity
    )
|
| 98 |
+
|
| 99 |
+
async def evaluate_retrieval(
    self,
    query: str,
    retrieved_files: List[str],
    ground_truth_files: List[str],
    top_k: int = 5,
    retrieval_latency_ms: float = 0,
    vector_scores: List[float] = None,
    bm25_scores: List[float] = None
) -> RetrievalMetrics:
    """
    Score the retrieval layer against the ground-truth file list.

    Only the first ``top_k`` retrieved files are considered.

    Metrics:
        - hit_rate: 1.0 if any considered file is a ground-truth file.
        - recall_at_k: matched files / number of ground-truth files.
        - precision_at_k: matched files / number of distinct considered files.
        - mrr: reciprocal rank of the first matching file (0.0 if none).
    """
    considered = retrieved_files[:top_k]
    retrieved_unique = set(considered)
    expected = set(ground_truth_files)
    matched = retrieved_unique & expected

    # Hit rate: is there any overlap at all?
    hit_rate = 1.0 if matched else 0.0

    # Recall@K and Precision@K share the same numerator.
    recall = len(matched) / len(expected) if expected else 0.0
    precision = len(matched) / len(retrieved_unique) if retrieved_unique else 0.0

    # MRR: reciprocal of the 1-based rank of the first correct file.
    mrr = next(
        (1.0 / rank for rank, path in enumerate(considered, 1) if path in expected),
        0.0,
    )

    # Context relevance (simplified): precision stands in for relevance.
    context_relevance = precision

    # Chunk integrity (simplified): fewer distinct files score higher.
    if retrieved_unique:
        chunk_integrity = min(1.0, 1.0 / len(retrieved_unique))
    else:
        chunk_integrity = 0.0

    def _average(scores):
        # Mean of the scores, or 0.0 when none were supplied.
        return sum(scores) / len(scores) if scores else 0.0

    return RetrievalMetrics(
        query=query,
        top_k=top_k,
        hit_rate=hit_rate,
        recall_at_k=recall,
        precision_at_k=precision,
        mrr=mrr,
        context_relevance=context_relevance,
        chunk_integrity=chunk_integrity,
        retrieval_latency_ms=retrieval_latency_ms,
        vector_score_avg=_average(vector_scores),
        bm25_score_avg=_average(bm25_scores),
        retrieved_files=retrieved_files,
        ground_truth_files=ground_truth_files
    )
|
| 163 |
+
|
| 164 |
+
async def evaluate_generation(
    self,
    query: str,
    retrieved_context: str,
    generated_answer: str,
    ground_truth_answer: str = "",
    generation_latency_ms: float = 0,
    token_usage: Dict[str, int] = None
) -> GenerationMetrics:
    """
    Score the generation layer.

    Metrics:
        - faithfulness: is the answer grounded in the context?
        - answer_relevance: does the answer address the question?
        - answer_completeness: is the answer complete enough?
        - code_correctness: are the code blocks in the answer correct?

    Each score is delegated to a helper on this object; ``token_usage``
    defaults to zero input/output tokens when it is not supplied.
    """
    # 1. Faithfulness (hallucination check).
    faithfulness_score = await self._judge_faithfulness(retrieved_context, generated_answer)

    # 2. Answer relevance: the answer judged against the question.
    relevance_score = await self._judge_answer_relevance(query, generated_answer)

    # 3. Completeness: the answer judged against the reference answer.
    completeness_score = self._judge_completeness(generated_answer, ground_truth_answer)

    # 4. Code correctness: extract the code blocks, then check them.
    code_blocks = self._extract_code_blocks(generated_answer)
    code_score = self._check_code_correctness(code_blocks)

    return GenerationMetrics(
        query=query,
        retrieved_context=retrieved_context,
        generated_answer=generated_answer,
        ground_truth_answer=ground_truth_answer,
        faithfulness=faithfulness_score,
        answer_relevance=relevance_score,
        answer_completeness=completeness_score,
        code_correctness=code_score,
        generated_code_samples=code_blocks,
        generation_latency_ms=generation_latency_ms,
        token_usage=token_usage or {"input": 0, "output": 0}
    )
|
| 220 |
+
|
| 221 |
+
async def _judge_faithfulness(self, context: str, answer: str) -> float:
|
| 222 |
+
"""
|
| 223 |
+
LLM-as-Judge: 判断回答是否由Context支撑
|
| 224 |
+
返回 0-1 的分数
|
| 225 |
+
|
| 226 |
+
注意:Faithfulness 判断的是"回答中的信息是否能从 Context 中找到依据"
|
| 227 |
+
而不是"回答是否完全复制 Context 内容"
|
| 228 |
+
"""
|
| 229 |
+
if not self.llm_client:
|
| 230 |
+
# 简化版: 如果没有LLM客户端,使用启发式方法
|
| 231 |
+
# 统计Answer中的关键词有多少出现在Context中
|
| 232 |
+
context_lower = context.lower()
|
| 233 |
+
answer_words = set(answer.lower().split())
|
| 234 |
+
# 过滤掉常见停用词
|
| 235 |
+
stop_words = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
|
| 236 |
+
'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
|
| 237 |
+
'would', 'could', 'should', 'may', 'might', 'must', 'shall',
|
| 238 |
+
'can', 'need', 'dare', 'ought', 'used', 'to', 'of', 'in',
|
| 239 |
+
'for', 'on', 'with', 'at', 'by', 'from', 'as', 'into', 'that',
|
| 240 |
+
'which', 'who', 'whom', 'this', 'these', 'those', 'it', 'its'}
|
| 241 |
+
meaningful_words = answer_words - stop_words
|
| 242 |
+
if not meaningful_words:
|
| 243 |
+
return 0.7 # 没有有意义的词,给默认分
|
| 244 |
+
# 计算答案中有多少有意义的词出现在Context中
|
| 245 |
+
found_count = sum(1 for word in meaningful_words if word in context_lower)
|
| 246 |
+
overlap = found_count / len(meaningful_words)
|
| 247 |
+
return min(1.0, overlap + 0.2) # 给一定的基础分
|
| 248 |
+
|
| 249 |
+
# 智能截取 Context:提取与 Answer 相关的部分
|
| 250 |
+
# 如果 Context 太长,优先包含 Answer 中提到的关键词附近的内容
|
| 251 |
+
max_context_len = 6000 # 增加到 6000 字符
|
| 252 |
+
if len(context) > max_context_len:
|
| 253 |
+
# 尝试找到 Answer 中提到的关键文件/函数名
|
| 254 |
+
import re
|
| 255 |
+
# 提取 Answer 中可能的文件路径或函数名
|
| 256 |
+
patterns = re.findall(r'[a-zA-Z_][a-zA-Z0-9_]*(?:\.[a-zA-Z_][a-zA-Z0-9_]*)*', answer[:500])
|
| 257 |
+
important_terms = [p for p in patterns if len(p) > 3][:5] # 取前5个重要词
|
| 258 |
+
|
| 259 |
+
# 优先截取包含这些词的部分
|
| 260 |
+
context_parts = []
|
| 261 |
+
remaining = max_context_len
|
| 262 |
+
for term in important_terms:
|
| 263 |
+
idx = context.find(term)
|
| 264 |
+
if idx != -1 and remaining > 0:
|
| 265 |
+
start = max(0, idx - 300)
|
| 266 |
+
end = min(len(context), idx + 700)
|
| 267 |
+
snippet = context[start:end]
|
| 268 |
+
if snippet not in ''.join(context_parts):
|
| 269 |
+
context_parts.append(snippet)
|
| 270 |
+
remaining -= len(snippet)
|
| 271 |
+
|
| 272 |
+
# 如果没找到相关部分,还是用前 6000 字符
|
| 273 |
+
if context_parts:
|
| 274 |
+
truncated_context = "\n...\n".join(context_parts)
|
| 275 |
+
else:
|
| 276 |
+
truncated_context = context[:max_context_len]
|
| 277 |
+
else:
|
| 278 |
+
truncated_context = context
|
| 279 |
+
|
| 280 |
+
# 改进的 Prompt:更明确定义 Faithfulness
|
| 281 |
+
prompt = f"""Evaluate the FAITHFULNESS of the answer to the given context.
|
| 282 |
+
|
| 283 |
+
FAITHFULNESS means: The claims and information in the answer can be verified from or are consistent with the context.
|
| 284 |
+
- Score HIGH (0.7-1.0) if the answer correctly identifies or explains concepts that ARE in the context
|
| 285 |
+
- Score MEDIUM (0.4-0.7) if the answer is partially supported but makes some unsupported claims
|
| 286 |
+
- Score LOW (0.0-0.4) if the answer contradicts the context or makes completely unsupported claims
|
| 287 |
+
|
| 288 |
+
NOTE: If the answer says "X is not in the context" and X is indeed not shown, that's a FAITHFUL statement (score 0.7+)
|
| 289 |
+
NOTE: If the answer correctly identifies WHERE something is defined based on imports/references in context, that's FAITHFUL
|
| 290 |
+
|
| 291 |
+
[Context]
|
| 292 |
+
{truncated_context}
|
| 293 |
+
|
| 294 |
+
[Answer]
|
| 295 |
+
{answer[:1500]}
|
| 296 |
+
|
| 297 |
+
SCORE (0.0-1.0):"""
|
| 298 |
+
|
| 299 |
+
try:
|
| 300 |
+
response = await self.llm_client.chat.completions.create(
|
| 301 |
+
model=self.model_name,
|
| 302 |
+
messages=[{"role": "user", "content": prompt}],
|
| 303 |
+
temperature=0.1,
|
| 304 |
+
max_tokens=10
|
| 305 |
+
)
|
| 306 |
+
score_str = response.choices[0].message.content.strip()
|
| 307 |
+
# 提取数字(处理可能的额外文本)
|
| 308 |
+
import re
|
| 309 |
+
match = re.search(r'(\d+\.?\d*)', score_str)
|
| 310 |
+
if match:
|
| 311 |
+
score = float(match.group(1))
|
| 312 |
+
else:
|
| 313 |
+
score = float(score_str)
|
| 314 |
+
return min(1.0, max(0.0, score))
|
| 315 |
+
except Exception as e:
|
| 316 |
+
print(f"⚠️ Faithfulness judgment failed: {e}")
|
| 317 |
+
return 0.5
|
| 318 |
+
|
| 319 |
+
async def _judge_answer_relevance(self, query: str, answer: str) -> float:
|
| 320 |
+
"""判断回答与问题的相关性"""
|
| 321 |
+
if not self.llm_client:
|
| 322 |
+
# 简化版: 使用关键词重叠度
|
| 323 |
+
query_words = set(query.lower().split())
|
| 324 |
+
answer_words = set(answer.lower().split())
|
| 325 |
+
overlap = len(query_words & answer_words) / max(len(query_words), 1)
|
| 326 |
+
return min(1.0, overlap + 0.3) # 基础分0.3+重叠度
|
| 327 |
+
|
| 328 |
+
prompt = f"""
|
| 329 |
+
Does the answer address the query?
|
| 330 |
+
|
| 331 |
+
[Query]
|
| 332 |
+
{query}
|
| 333 |
+
|
| 334 |
+
[Answer]
|
| 335 |
+
{answer[:1000]}
|
| 336 |
+
|
| 337 |
+
Score (0.0-1.0):
|
| 338 |
+
"""
|
| 339 |
+
|
| 340 |
+
try:
|
| 341 |
+
response = await self.llm_client.chat.completions.create(
|
| 342 |
+
model=self.model_name,
|
| 343 |
+
messages=[{"role": "user", "content": prompt}],
|
| 344 |
+
temperature=0.1,
|
| 345 |
+
max_tokens=10
|
| 346 |
+
)
|
| 347 |
+
score = float(response.choices[0].message.content.strip())
|
| 348 |
+
return min(1.0, max(0.0, score))
|
| 349 |
+
except:
|
| 350 |
+
return 0.5
|
| 351 |
+
|
| 352 |
+
def _judge_completeness(self, generated_answer: str, ground_truth: str = "") -> float:
|
| 353 |
+
"""判断回答的完整性"""
|
| 354 |
+
# 简化版: 根据长度和结构
|
| 355 |
+
if len(generated_answer) < 50:
|
| 356 |
+
return 0.3
|
| 357 |
+
elif len(generated_answer) < 200:
|
| 358 |
+
return 0.6
|
| 359 |
+
else:
|
| 360 |
+
return 0.9
|
| 361 |
+
|
| 362 |
+
def _extract_code_blocks(self, text: str) -> List[str]:
|
| 363 |
+
"""从文本中提取代码块"""
|
| 364 |
+
import re
|
| 365 |
+
code_pattern = r'```[\w]*\n(.*?)\n```'
|
| 366 |
+
matches = re.findall(code_pattern, text, re.DOTALL)
|
| 367 |
+
return matches
|
| 368 |
+
|
| 369 |
+
def _check_code_correctness(self, code_samples: List[str]) -> float:
|
| 370 |
+
"""检查代码是否有语法错误"""
|
| 371 |
+
if not code_samples:
|
| 372 |
+
return 1.0 # 没有代码就认为正确
|
| 373 |
+
|
| 374 |
+
import ast
|
| 375 |
+
correct_count = 0
|
| 376 |
+
for code in code_samples:
|
| 377 |
+
try:
|
| 378 |
+
ast.parse(code)
|
| 379 |
+
correct_count += 1
|
| 380 |
+
except SyntaxError:
|
| 381 |
+
pass
|
| 382 |
+
|
| 383 |
+
return correct_count / len(code_samples)
|
| 384 |
+
|
| 385 |
+
async def evaluate_agentic(
    self,
    query: str,
    tool_calls: List[Dict[str, Any]],
    success: bool,
    steps_taken: int = 0,
    end_to_end_latency_ms: float = 0
) -> AgenticMetrics:
    """
    Evaluate the agent's decisions and behaviour for one query.

    Args:
        query: The user query the agent handled.
        tool_calls: Tool-call records; the "success" and "name" keys are read.
        success: Whether the overall run succeeded.
        steps_taken: Number of steps the agent took.
        end_to_end_latency_ms: Total latency of the run in milliseconds.

    Returns:
        An AgenticMetrics instance populated with the derived scores.
    """
    # Tool selection accuracy: approximated from the overall outcome.
    selection_score = 1.0 if success else 0.5

    # Tool parameter correctness: full marks only if every call succeeded.
    every_call_ok = all(call.get("success", False) for call in tool_calls)
    parameter_score = 1.0 if every_call_ok else 0.5

    # Redundancy (simplified): a call that repeats the previous tool name
    # counts as one unnecessary step. Backtracking is not measured.
    names = [call.get("name", "") for call in tool_calls]
    repeated_calls = sum(
        1 for previous, current in zip(names, names[1:]) if previous == current
    )

    return AgenticMetrics(
        query=query,
        tool_calls=tool_calls,
        tool_selection_accuracy=selection_score,
        tool_parameter_correctness=parameter_score,
        steps_taken=steps_taken,
        unnecessary_steps=repeated_calls,
        backtrack_count=0,
        success=success,
        end_to_end_latency_ms=end_to_end_latency_ms
    )
|
| 426 |
+
|
| 427 |
+
def get_statistics(self) -> Dict[str, Any]:
    """
    Aggregate statistics from evaluation/sft_data/eval_results.jsonl.

    Malformed lines are skipped one by one: invalid JSON and JSON values
    that are not objects are ignored without aborting the rest of the file.
    A non-numeric `overall_score` is counted as 0 so averaging cannot fail.

    Returns:
        A dict with total_evaluations, average_score, quality_distribution
        (gold/silver/bronze/rejected counts) and top_issues (up to five
        {"issue", "count"} entries, most frequent first). All-zero defaults
        are returned when the results file does not exist.
    """
    eval_results_path = "evaluation/sft_data/eval_results.jsonl"

    stats = {
        "total_evaluations": 0,
        "average_score": 0.0,
        "quality_distribution": {
            "gold": 0,
            "silver": 0,
            "bronze": 0,
            "rejected": 0
        },
        "top_issues": []
    }

    if not os.path.exists(eval_results_path):
        return stats

    scores = []
    issues = {}

    try:
        with open(eval_results_path, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    result = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if not isinstance(result, dict):
                    continue  # e.g. a bare list/number line: not a record

                stats["total_evaluations"] += 1

                # Collect the score; anything non-numeric counts as 0.
                score = result.get("overall_score", 0)
                if not isinstance(score, (int, float)):
                    score = 0
                scores.append(score)

                # Quality-tier histogram (unknown tiers are not counted).
                tier = result.get("data_quality_tier", "bronze")
                if isinstance(tier, str) and tier in stats["quality_distribution"]:
                    stats["quality_distribution"][tier] += 1

                # Common issues, taken from `notes` or `error_message`.
                note = result.get("notes", "") or result.get("error_message", "")
                if note:
                    if not isinstance(note, str):
                        note = str(note)  # keep the key hashable
                    issues[note] = issues.get(note, 0) + 1
    except Exception as e:
        # Best effort: report read problems and return what was gathered.
        print(f"⚠️ Error reading eval results: {e}")

    if scores:
        stats["average_score"] = sum(scores) / len(scores)

    # Five most frequent issues.
    if issues:
        stats["top_issues"] = [
            {"issue": issue, "count": count}
            for issue, count in sorted(issues.items(), key=lambda x: x[1], reverse=True)[:5]
        ]

    return stats
|
| 493 |
+
|
| 494 |
+
|
| 495 |
+
# ============================================================================
|
| 496 |
+
# __all__ 导出列表(保持向后兼容)
|
| 497 |
+
# ============================================================================
|
| 498 |
+
|
| 499 |
+
__all__ = [
    # Enums
    "EvaluationLayer", "DataQualityTier",
    # Data models
    "QueryRewriteMetrics", "RetrievalMetrics", "GenerationMetrics",
    "AgenticMetrics", "EvaluationResult",
    # Engines
    "EvaluationEngine", "DataRoutingEngine",
]
|
evaluation/golden_dataset_builder.py
ADDED
|
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 文件路径: evaluation/golden_dataset_builder.py
|
| 2 |
+
"""
|
| 3 |
+
黄金数据集构建工具
|
| 4 |
+
用于快速构建评估所需的标注数据集
|
| 5 |
+
|
| 6 |
+
使用场景:
|
| 7 |
+
1. 初始化: 为新项目快速创建 50 条测试用例
|
| 8 |
+
2. 扩展: 定期添加新的问题和标注
|
| 9 |
+
3. 验证: 自动验证数据集的完整性
|
| 10 |
+
|
| 11 |
+
Author: Dexter
|
| 12 |
+
Date: 2025-01-27
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import json
|
| 16 |
+
import os
|
| 17 |
+
from typing import List, Dict, Optional
|
| 18 |
+
from dataclasses import dataclass, asdict
|
| 19 |
+
from datetime import datetime
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@dataclass
class GoldenSample:
    """One labelled sample of the golden evaluation dataset."""
    id: str                     # unique ID
    description: str            # what kind of question this is (for annotators)
    query: str                  # the user query
    expected_files: List[str]   # ground truth: files that should be retrieved
    expected_answer: str = ""   # ground truth: expected answer (optional)
    difficulty: str = "medium"  # easy/medium/hard
    category: str = "general"   # general/code_finding/architecture/workflow
    language: str = "en"        # en/zh
    created_at: str = ""

    def __post_init__(self):
        # Stamp the creation time unless the caller supplied one.
        if not self.created_at:
            self.created_at = datetime.now().isoformat()


class GoldenDatasetBuilder:
    """Loads, edits and saves a golden dataset stored as a JSON list."""

    def __init__(self, filepath: str = "evaluation/golden_dataset.json"):
        self.filepath = filepath
        self.samples: List[GoldenSample] = []
        self.load()

    @staticmethod
    def _sample_from_record(index: int, item) -> GoldenSample:
        """Build a sample from one stored record (current or legacy format)."""
        if isinstance(item, dict) and "id" in item:
            # Current format. Keys the dataclass does not know are ignored so
            # one extra field cannot invalidate the whole dataset.
            known = GoldenSample.__dataclass_fields__
            return GoldenSample(**{k: v for k, v in item.items() if k in known})
        # Legacy format: no id and a single `answer_file`. The record's
        # position in the file becomes its id, so ids stay distinct.
        return GoldenSample(
            id=str(index),
            description=item.get("description", ""),
            query=item.get("query", ""),
            expected_files=[item.get("answer_file", "")] if item.get("answer_file") else []
        )

    def load(self):
        """
        Load the dataset from `self.filepath` if the file exists.

        A file that cannot be parsed or converted leaves the builder with an
        empty sample list (best effort) and prints a warning.
        """
        if not os.path.exists(self.filepath):
            return
        with open(self.filepath, 'r', encoding='utf-8') as f:
            try:
                raw_data = json.load(f)
                if isinstance(raw_data, list):
                    self.samples = [
                        self._sample_from_record(index, item)
                        for index, item in enumerate(raw_data)
                    ]
            except Exception as e:
                print(f"⚠️ Failed to load golden dataset {self.filepath}: {e}")
                self.samples = []

    def save(self):
        """Write all samples to `self.filepath` as pretty-printed JSON."""
        directory = os.path.dirname(self.filepath)
        if directory:
            # A bare filename has no directory part; makedirs("") would raise.
            os.makedirs(directory, exist_ok=True)
        data = [asdict(s) for s in self.samples]
        with open(self.filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

    def add_sample(self, sample: GoldenSample):
        """Append a sample, overwriting its id with the next sequential one."""
        sample.id = f"sample_{len(self.samples):04d}"
        self.samples.append(sample)

    def add_samples_batch(self, samples: List[GoldenSample]):
        """Append several samples in order."""
        for sample in samples:
            self.add_sample(sample)

    def get_samples_by_category(self, category: str) -> List[GoldenSample]:
        """Return the samples whose category equals `category`."""
        return [s for s in self.samples if s.category == category]

    def get_samples_by_difficulty(self, difficulty: str) -> List[GoldenSample]:
        """Return the samples whose difficulty equals `difficulty`."""
        return [s for s in self.samples if s.difficulty == difficulty]

    def get_statistics(self) -> Dict:
        """Return the total count plus per-category/difficulty/language counts."""
        stats = {
            "total": len(self.samples),
            "by_category": {},
            "by_difficulty": {},
            "by_language": {}
        }

        for s in self.samples:
            stats["by_category"][s.category] = stats["by_category"].get(s.category, 0) + 1
            stats["by_difficulty"][s.difficulty] = stats["by_difficulty"].get(s.difficulty, 0) + 1
            stats["by_language"][s.language] = stats["by_language"].get(s.language, 0) + 1

        return stats
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
# ============================================================================
|
| 112 |
+
# 预定义的通用问题模板
|
| 113 |
+
# ============================================================================
|
| 114 |
+
|
| 115 |
+
# 针对 FastAPI 项目的初始数据集 (参考你现有的 golden_dataset.json)
|
| 116 |
+
FASTAPI_GOLDEN_SAMPLES = [
    # Easy: locating a piece of code.
    GoldenSample(
        id="", description="简单函数查找",
        query="Where is the 'serialize_response' function?",
        expected_files=["fastapi/routing.py"],
        difficulty="easy", category="code_finding",
    ),
    # Medium: understanding data flow.
    GoldenSample(
        id="", description="理解核心模块职责",
        query="How does dependency injection work in FastAPI?",
        expected_files=["fastapi/dependencies/utils.py", "fastapi/depends.py"],
        difficulty="medium", category="architecture",
    ),
    # Hard: understanding a workflow that spans several files.
    GoldenSample(
        id="", description="完整工作流理解",
        query="Show me the complete flow from request to response in FastAPI",
        expected_files=["fastapi/routing.py", "fastapi/applications.py", "fastapi/dependencies/utils.py"],
        difficulty="hard", category="workflow",
    ),
]
|
| 147 |
+
|
| 148 |
+
# GitHub Agent 项目的初始数据集
|
| 149 |
+
GITHUB_AGENT_GOLDEN_SAMPLES = [
    # Easy: locating the chunking implementation.
    GoldenSample(
        id="", description="检索核心逻辑",
        query="How is chunk_file method implemented?",
        expected_files=["app/services/chunking_service.py"],
        expected_answer="The chunk_file method is implemented in chunking_service.py. It takes content and file_path as parameters and uses AST parsing for Python files to intelligently chunk the code.",
        difficulty="easy", category="code_finding", language="en",
    ),
    # Medium: which vector store backs retrieval.
    GoldenSample(
        id="", description="向量搜索机制",
        query="What vector database is used for retrieval?",
        expected_files=["app/services/vector_service.py"],
        difficulty="medium", category="architecture", language="en",
    ),
    # Hard: the end-to-end repository analysis flow.
    GoldenSample(
        id="", description="完整分析流程",
        query="How does the agent analyze a GitHub repository?",
        expected_files=["app/services/agent_service.py", "app/services/chunking_service.py", "app/services/vector_service.py"],
        difficulty="hard", category="workflow", language="en",
    ),
]
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
# ============================================================================
|
| 184 |
+
# 交互式数据集构建工具
|
| 185 |
+
# ============================================================================
|
| 186 |
+
|
| 187 |
+
def interactive_builder():
    """
    Interactive console loop for building the golden dataset.

    Menu: 1 add a sample, 2 list the last ten samples, 3 filter by category,
    4 show statistics, 5 save and exit, 0 exit without saving. Any other
    input simply shows the menu again.

    The comma-separated file list entered for a new sample is split, each
    path is stripped of surrounding whitespace, and empty entries are
    dropped, so a blank answer yields an empty list rather than [""].
    """
    builder = GoldenDatasetBuilder()

    print("=" * 60)
    print("🛠️ 黄金数据集构建工具")
    print("=" * 60)

    while True:
        print("\n请选择操作:")
        print("1. 添加新样本")
        print("2. 查看现有样本")
        print("3. 按类别筛选")
        print("4. 统计信息")
        print("5. 保存并退出")
        print("0. 退出(不保存)")

        choice = input("请输入选项 (0-5): ").strip()

        if choice == "1":
            # Prompts are asked in this fixed order.
            description = input("📝 描述 (问题类型): ")
            query = input("❓ 查询/问题: ")
            raw_files = input("📁 预期文件 (逗号分隔): ")
            expected_files = [path.strip() for path in raw_files.split(",") if path.strip()]
            expected_answer = input("📄 标准答案 (可选): ")
            difficulty = input("⭐ 难度 (easy/medium/hard) [medium]: ") or "medium"
            category = input("🏷️ 类别 (code_finding/architecture/workflow/general) [general]: ") or "general"
            language = input("🌍 语言 (en/zh) [en]: ") or "en"

            sample = GoldenSample(
                id="",
                description=description,
                query=query,
                expected_files=expected_files,
                expected_answer=expected_answer,
                difficulty=difficulty,
                category=category,
                language=language
            )
            builder.add_sample(sample)
            print("✅ 样本已添加")

        elif choice == "2":
            print(f"\n总共 {len(builder.samples)} 个样本:")
            for s in builder.samples[-10:]:  # show the last 10
                print(f" - [{s.difficulty}] {s.query[:50]}")

        elif choice == "3":
            category = input("输入类别: ")
            samples = builder.get_samples_by_category(category)
            print(f"\n找到 {len(samples)} 个 '{category}' 类别的样本:")
            for s in samples:
                print(f" - {s.query}")

        elif choice == "4":
            stats = builder.get_statistics()
            print(f"\n📊 数据集统计:")
            print(f" 总样本数: {stats['total']}")
            print(f" 按类别: {stats['by_category']}")
            print(f" 按难度: {stats['by_difficulty']}")
            print(f" 按语言: {stats['by_language']}")

        elif choice == "5":
            builder.save()
            print("✅ 数据集已保存")
            break

        elif choice == "0":
            print("⚠️ 未保存,退出")
            break
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
# ============================================================================
|
| 251 |
+
# 自动评估数据集的完整性
|
| 252 |
+
# ============================================================================
|
| 253 |
+
|
| 254 |
+
def validate_golden_dataset(filepath: str = "evaluation/golden_dataset.json") -> Dict:
    """
    Check a golden dataset file for empty queries, empty file lists and
    duplicate queries.

    Args:
        filepath: Path of the dataset JSON file to load.

    Returns:
        A dict with `valid` (True when no issue list has entries),
        `total_samples`, `issues` (lists of messages keyed by issue kind;
        `missing_fields` is reserved and never filled here) and `stats`.
    """
    builder = GoldenDatasetBuilder(filepath)
    issues = {kind: [] for kind in ("missing_fields", "empty_queries", "empty_files", "duplicates")}
    queries_seen = set()

    for position, sample in enumerate(builder.samples):
        # Required content.
        if not sample.query:
            issues["empty_queries"].append(f"Sample {position}: query is empty")

        if not sample.expected_files or not any(sample.expected_files):
            issues["empty_files"].append(f"Sample {position}: expected_files is empty")

        # Duplicate queries.
        if sample.query in queries_seen:
            issues["duplicates"].append(f"Sample {position}: duplicate query")
        queries_seen.add(sample.query)

    has_problem = any(issues.values())
    return {
        "valid": not has_problem,
        "total_samples": len(builder.samples),
        "issues": issues,
        "stats": builder.get_statistics()
    }
|
| 286 |
+
|
| 287 |
+
|
| 288 |
+
# ============================================================================
|
| 289 |
+
# 快速初始化脚本
|
| 290 |
+
# ============================================================================
|
| 291 |
+
|
| 292 |
+
def init_github_agent_dataset():
    """
    Seed evaluation/golden_dataset.json with the GitHub Agent samples.

    Samples are appended to whatever the file already holds (the existing
    content is not cleared), the file is saved, and a summary is printed.
    """
    builder = GoldenDatasetBuilder("evaluation/golden_dataset.json")

    # Base samples first. Existing samples are kept; assign
    # `builder.samples = []` before this call to start from scratch.
    builder.add_samples_batch(GITHUB_AGENT_GOLDEN_SAMPLES)

    # Additional samples, intended to grow towards 30+.
    retrieval_sample = GoldenSample(
        id="",
        description="向量检索质量",
        query="What retrieval metrics are tracked?",
        expected_files=["evaluation/evaluation_framework.py"],
        difficulty="medium",
        category="architecture"
    )
    decision_sample = GoldenSample(
        id="",
        description="Agent决策过程",
        query="How does the agent decide which files to read?",
        expected_files=["app/services/agent_service.py"],
        difficulty="hard",
        category="workflow"
    )
    error_handling_sample = GoldenSample(
        id="",
        description="错误处理",
        query="Where are network timeout errors handled?",
        expected_files=["app/services/agent_service.py", "app/services/chat_service.py"],
        difficulty="medium",
        category="code_finding"
    )
    builder.add_samples_batch([retrieval_sample, decision_sample, error_handling_sample])
    builder.save()

    print(f"✅ 初始化完成: {len(builder.samples)} 个样本")
    print(f"📊 {builder.get_statistics()}")
|
| 334 |
+
|
| 335 |
+
|
| 336 |
+
# ============================================================================
|
| 337 |
+
# 导出为 Ragas 格式
|
| 338 |
+
# ============================================================================
|
| 339 |
+
|
| 340 |
+
def export_to_ragas_format(golden_filepath: str, output_filepath: str = "evaluation/ragas_eval_dataset.json"):
    """
    Export the golden dataset in the layout used for Ragas evaluation.

    The output JSON holds parallel lists:
        {
            "questions": [...],
            "contexts": [...],
            "ground_truths": [...],
            "metadata": [...]
        }

    Args:
        golden_filepath: Path of the golden dataset JSON to read.
        output_filepath: Destination file. Its parent directory is created
            when the path has one; a bare filename is written to the
            current directory.
    """
    builder = GoldenDatasetBuilder(golden_filepath)

    ragas_data = {
        "questions": [],
        "contexts": [],
        "ground_truths": [],
        "metadata": []
    }

    for sample in builder.samples:
        ragas_data["questions"].append(sample.query)
        ragas_data["ground_truths"].append({
            "answer": sample.expected_answer,
            "files": sample.expected_files
        })
        # The context entry is the newline-joined list of expected file paths.
        ragas_data["contexts"].append("\n".join(sample.expected_files))
        ragas_data["metadata"].append({
            "difficulty": sample.difficulty,
            "category": sample.category,
            "description": sample.description
        })

    output_dir = os.path.dirname(output_filepath)
    if output_dir:
        # os.makedirs("") raises, so only create a directory when there is one.
        os.makedirs(output_dir, exist_ok=True)
    with open(output_filepath, 'w', encoding='utf-8') as f:
        json.dump(ragas_data, f, ensure_ascii=False, indent=2)

    print(f"✅ Exported to {output_filepath}")
    print(f" Questions: {len(ragas_data['questions'])}")
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
# ============================================================================
|
| 382 |
+
# 命令行接口
|
| 383 |
+
# ============================================================================
|
| 384 |
+
|
| 385 |
+
if __name__ == "__main__":
    # Command-line entry point: the first argument selects the action.
    import sys

    def _print_validation():
        result = validate_golden_dataset()
        print(json.dumps(result, indent=2, ensure_ascii=False))

    handlers = {
        "init": init_github_agent_dataset,
        "validate": _print_validation,
        "export-ragas": lambda: export_to_ragas_format("evaluation/golden_dataset.json"),
        "interactive": interactive_builder,
    }

    if len(sys.argv) <= 1:
        # No command given: print usage.
        print("黄金数据集构建工具")
        print()
        print("用法:")
        print(" python golden_dataset_builder.py init # 快速初始化")
        print(" python golden_dataset_builder.py validate # 验证数据集")
        print(" python golden_dataset_builder.py export-ragas # 导出为Ragas格式")
        print(" python golden_dataset_builder.py interactive # 交互式构建")
    else:
        command = sys.argv[1]
        handler = handlers.get(command)
        if handler is None:
            print(f"Unknown command: {command}")
        else:
            handler()
|
evaluation/models.py
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 文件路径: evaluation/models.py
|
| 2 |
+
"""
|
| 3 |
+
评估数据模型定义
|
| 4 |
+
|
| 5 |
+
将所有数据类和枚举集中管理,保持代码职责清晰
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from dataclasses import dataclass, field, asdict
|
| 9 |
+
from typing import List, Dict, Optional, Any
|
| 10 |
+
from enum import Enum
|
| 11 |
+
from datetime import datetime
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class EvaluationLayer(Enum):
    """Pipeline layer that an evaluation applies to."""

    QUERY_REWRITE = "query_rewrite"
    RETRIEVAL = "retrieval"
    GENERATION = "generation"
    AGENTIC = "agentic"
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class DataQualityTier(Enum):
    """Data-quality tier, used to route samples for SFT."""

    # Perfect sample (score > 0.9)
    GOLD = "gold"
    # High-quality sample (score 0.7-0.9)
    SILVER = "silver"
    # Usable sample (score 0.5-0.7)
    BRONZE = "bronze"
    # Rejected (score < 0.5)
    REJECTED = "rejected"
    # Sample produced by self-correction (used for DPO)
    CORRECTED = "corrected"
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# ============================================================================
|
| 32 |
+
# 各层评估指标
|
| 33 |
+
# ============================================================================
|
| 34 |
+
|
| 35 |
+
@dataclass
class QueryRewriteMetrics:
    """Metrics for the query-rewrite stage."""

    original_query: str
    rewritten_query: str
    language_detected: str
    keyword_coverage: float       # 0-1
    semantic_preservation: float  # 0-1
    diversity_score: float        # 0-1

    def overall_score(self) -> float:
        """Weighted total: coverage 0.4, semantic preservation 0.4, diversity 0.2."""
        coverage_part = self.keyword_coverage * 0.4
        semantic_part = self.semantic_preservation * 0.4
        diversity_part = self.diversity_score * 0.2
        return coverage_part + semantic_part + diversity_part
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
@dataclass
class RetrievalMetrics:
    """Metrics for the retrieval stage."""

    query: str
    top_k: int

    # Core metrics
    hit_rate: float
    recall_at_k: float
    precision_at_k: float
    mrr: float  # Mean Reciprocal Rank

    # Advanced metrics
    context_relevance: float
    chunk_integrity: float
    retrieval_latency_ms: float

    # Hybrid retrieval
    vector_score_avg: float
    bm25_score_avg: float

    retrieved_files: List[str] = field(default_factory=list)
    ground_truth_files: List[str] = field(default_factory=list)

    def overall_score(self) -> float:
        """Weighted total: recall 0.3, precision 0.3, relevance 0.25, integrity 0.15."""
        recall_part = self.recall_at_k * 0.3
        precision_part = self.precision_at_k * 0.3
        relevance_part = self.context_relevance * 0.25
        integrity_part = self.chunk_integrity * 0.15
        return recall_part + precision_part + relevance_part + integrity_part
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
@dataclass
class GenerationMetrics:
    """Metrics for the answer-generation stage."""

    query: str
    retrieved_context: str
    generated_answer: str

    # Core metrics
    faithfulness: float
    answer_relevance: float
    answer_completeness: float
    code_correctness: float

    # Optional
    ground_truth_answer: str = ""
    hallucination_count: int = 0
    unsupported_claims: List[str] = field(default_factory=list)
    generated_code_samples: List[str] = field(default_factory=list)
    generation_latency_ms: float = 0
    token_usage: Dict[str, int] = field(default_factory=lambda: {"input": 0, "output": 0})

    def overall_score(self) -> float:
        """
        Weighted total (faithfulness 0.35, relevance 0.35, completeness 0.2,
        code correctness 0.1) minus 0.1 per hallucination, floored at 0.
        """
        faithfulness_part = self.faithfulness * 0.35
        relevance_part = self.answer_relevance * 0.35
        completeness_part = self.answer_completeness * 0.2
        code_part = self.code_correctness * 0.1
        weighted_total = faithfulness_part + relevance_part + completeness_part + code_part
        hallucination_penalty = self.hallucination_count * 0.1
        return max(0, weighted_total - hallucination_penalty)
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
@dataclass
class AgenticMetrics:
    """Evaluation metrics for agent behaviour (tool use and step efficiency)."""

    query: str
    tool_selection_accuracy: float
    tool_parameter_correctness: float

    tool_calls: List[Dict[str, Any]] = field(default_factory=list)
    steps_taken: int = 0
    unnecessary_steps: int = 0
    backtrack_count: int = 0
    success: bool = True
    early_termination: bool = False
    end_to_end_latency_ms: float = 0

    def efficiency_score(self) -> float:
        """Return a 0..1 efficiency score for the agent run.

        The score is ``1 - unnecessary_steps / steps_taken`` minus 0.1 per
        backtrack, clamped to the range [0, 1]. A run with no steps scores 0.
        """
        if self.steps_taken == 0:
            return 0
        wasted_fraction = self.unnecessary_steps / self.steps_taken
        raw = 1 - wasted_fraction - self.backtrack_count * 0.1
        capped = min(1, raw)
        return max(0, capped)

    def overall_score(self) -> float:
        """Return the weighted agent score.

        Weights: tool selection 0.4, tool parameters 0.3, efficiency 0.2,
        success flag 0.1.
        """
        selection_part = self.tool_selection_accuracy * 0.4
        parameter_part = self.tool_parameter_correctness * 0.3
        efficiency_part = self.efficiency_score() * 0.2
        success_value = 1.0 if self.success else 0.0
        return selection_part + parameter_part + efficiency_part + success_value * 0.1
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
# ============================================================================
|
| 149 |
+
# 综合评估结果
|
| 150 |
+
# ============================================================================
|
| 151 |
+
|
| 152 |
+
@dataclass
class EvaluationResult:
    """Complete result of a single evaluation.

    Bundles the per-layer metric objects with the aggregated score, the
    derived data-quality tier and the SFT/DPO flags.
    """

    session_id: str
    query: str
    repo_url: str
    timestamp: datetime
    language: str = "en"

    # Per-layer evaluation results (each layer is optional)
    query_rewrite_metrics: Optional[QueryRewriteMetrics] = None
    retrieval_metrics: Optional[RetrievalMetrics] = None
    generation_metrics: Optional[GenerationMetrics] = None
    agentic_metrics: Optional[AgenticMetrics] = None

    # Aggregated score and tier
    overall_score: float = 0.0
    data_quality_tier: DataQualityTier = DataQualityTier.BRONZE

    # SFT labelling flags
    sft_ready: bool = False
    dpo_candidate: bool = False

    # Metadata
    error_message: Optional[str] = None
    notes: str = ""

    def compute_overall_score(self) -> float:
        """Compute the weighted overall score and update tier / SFT flag.

        Layer weights are query rewrite 0.15, retrieval 0.35, generation 0.4
        and agentic 0.1; the weights of the layers that are present are
        re-normalised so they sum to 1.

        Returns:
            The overall score. When no layer metrics are present, 0.0 is
            returned and no attribute is modified.
        """
        layers = (
            (self.query_rewrite_metrics, 0.15),
            (self.retrieval_metrics, 0.35),
            (self.generation_metrics, 0.4),
            (self.agentic_metrics, 0.1),
        )
        weighted = [(metrics.overall_score(), weight) for metrics, weight in layers if metrics]

        if not weighted:
            return 0.0

        total_weight = sum(weight for _, weight in weighted)
        self.overall_score = sum(score * weight for score, weight in weighted) / total_weight

        # Tiering thresholds: >0.9 gold, >0.7 silver, >0.5 bronze, else rejected.
        score = self.overall_score
        if score > 0.9:
            tier = DataQualityTier.GOLD
        elif score > 0.7:
            tier = DataQualityTier.SILVER
        elif score > 0.5:
            tier = DataQualityTier.BRONZE
        else:
            tier = DataQualityTier.REJECTED
        self.data_quality_tier = tier

        # Gold and silver samples are marked SFT-ready.
        # NOTE(review): the flag is only ever set to True here, so a True left
        # over from an earlier call is not cleared -- confirm this is intended.
        if score > 0.7:
            self.sft_ready = True

        return self.overall_score

    def to_dict(self) -> Dict:
        """Convert the result into a plain dictionary for storage.

        The timestamp is emitted in ISO format and the tier as its enum
        value. Metric sections are included only for layers that are present.
        """
        payload = dict(
            session_id=self.session_id,
            query=self.query,
            repo_url=self.repo_url,
            timestamp=self.timestamp.isoformat(),
            language=self.language,
            overall_score=self.overall_score,
            data_quality_tier=self.data_quality_tier.value,
            sft_ready=self.sft_ready,
            dpo_candidate=self.dpo_candidate,
            error_message=self.error_message,
            notes=self.notes,
        )

        optional_sections = (
            ("query_rewrite", self.query_rewrite_metrics),
            ("retrieval", self.retrieval_metrics),
            ("generation", self.generation_metrics),
            ("agentic", self.agentic_metrics),
        )
        for section_name, metrics in optional_sections:
            if metrics:
                payload[section_name] = asdict(metrics)

        return payload
|
evaluation/test_retrieval.py
ADDED
|
@@ -0,0 +1,330 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
检索系统离线评估脚本
|
| 4 |
+
|
| 5 |
+
用于测试 chunking 和检索策略的准确率。
|
| 6 |
+
使用 golden_dataset.json 中的标注数据作为 ground truth。
|
| 7 |
+
|
| 8 |
+
使用方法:
|
| 9 |
+
python evaluation/test_retrieval.py --repo https://github.com/tiangolo/fastapi
|
| 10 |
+
python evaluation/test_retrieval.py --repo https://github.com/tiangolo/fastapi --top-k 5
|
| 11 |
+
python evaluation/test_retrieval.py --repo https://github.com/tiangolo/fastapi --verbose
|
| 12 |
+
|
| 13 |
+
Author: Dexter
|
| 14 |
+
Date: 2026-01-28
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
import json
|
| 18 |
+
import os
|
| 19 |
+
import sys
|
| 20 |
+
import asyncio
|
| 21 |
+
import argparse
|
| 22 |
+
from typing import List, Dict, Tuple
|
| 23 |
+
from dataclasses import dataclass, field
|
| 24 |
+
from datetime import datetime
|
| 25 |
+
|
| 26 |
+
# 添加项目根目录到 path
|
| 27 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 28 |
+
|
| 29 |
+
from app.services.vector_service import store_manager
|
| 30 |
+
from app.services.github_service import get_repo_structure
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
@dataclass
class RetrievalTestResult:
    """Outcome of a single golden-dataset test case."""

    query: str
    expected_files: List[str]
    retrieved_files: List[str]
    # True when at least one expected file was retrieved.
    hit: bool
    # Recall: matched expected files / total expected files.
    recall: float
    # Precision: matched expected files / number of retrieved files.
    precision: float
    # Reciprocal rank: 1 / position of the first matching file.
    reciprocal_rank: float
    difficulty: str = ""
    category: str = ""
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
@dataclass
class EvaluationReport:
    """Full report of one retrieval evaluation run."""

    repo_url: str
    top_k: int
    total_queries: int
    # ISO-formatted creation time, captured when the report is instantiated.
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())

    # Aggregated metrics
    hit_rate: float = 0.0        # share of queries with at least one hit
    mean_recall: float = 0.0     # average recall
    mean_precision: float = 0.0  # average precision
    mrr: float = 0.0             # Mean Reciprocal Rank

    # Metrics grouped by difficulty label
    by_difficulty: Dict[str, Dict] = field(default_factory=dict)

    # Detailed per-query results and the queries that had no hit
    results: List[RetrievalTestResult] = field(default_factory=list)
    failed_cases: List[Dict] = field(default_factory=list)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
class RetrievalEvaluator:
    """Offline evaluator for the retrieval system.

    Runs every query of a golden dataset against a session's vector store
    (hybrid search) and compares the retrieved file paths with the expected
    files to compute hit rate, recall, precision and MRR.
    """

    def __init__(self, golden_dataset_path: str = "evaluation/golden_dataset.json"):
        """Load the golden dataset from ``golden_dataset_path``.

        Raises:
            FileNotFoundError: If the dataset file does not exist.
        """
        self.golden_dataset = self._load_golden_dataset(golden_dataset_path)
        print(f"📊 Loaded {len(self.golden_dataset)} test cases from golden dataset")

    def _load_golden_dataset(self, path: str) -> List[Dict]:
        """Read and return the golden dataset (parsed JSON) stored at ``path``.

        Raises:
            FileNotFoundError: If ``path`` does not exist.
        """
        if not os.path.exists(path):
            raise FileNotFoundError(f"Golden dataset not found: {path}")

        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    @staticmethod
    def _collect_files(results) -> List[str]:
        """Return the distinct non-empty ``"file"`` values of dict results, in rank order."""
        ordered: List[str] = []
        for doc in results:
            if not isinstance(doc, dict):
                continue
            file_path = doc.get("file", "")
            if file_path and file_path not in ordered:
                ordered.append(file_path)
        return ordered

    @staticmethod
    def _score_sample(expected_files, retrieved_files, top_k):
        """Return ``(hit, recall, precision, reciprocal_rank)`` for one query.

        Only the first ``top_k`` retrieved files are considered.
        """
        expected_set = set(expected_files)
        top_files = retrieved_files[:top_k]
        matched = expected_set & set(top_files)

        hit = len(matched) > 0
        recall = len(matched) / len(expected_set) if expected_set else 0
        # Dividing by the size of the considered slice avoids a
        # ZeroDivisionError when top_k is 0.
        precision = len(matched) / len(top_files) if top_files else 0

        reciprocal_rank = 0.0
        for rank, file in enumerate(top_files, 1):
            if file in expected_set:
                reciprocal_rank = 1.0 / rank
                break
        return hit, recall, precision, reciprocal_rank

    async def evaluate(
        self,
        repo_url: str,
        session_id: str = "eval_test",
        top_k: int = 5,
        verbose: bool = False
    ) -> "EvaluationReport | None":
        """Run the full retrieval evaluation.

        Samples without a query or without expected files, and samples whose
        search raises, are skipped. All aggregate metrics (including the hit
        rate) are averaged over the samples that were actually evaluated.

        Args:
            repo_url: URL of the repository being evaluated.
            session_id: Session ID used to look up the vector store.
            top_k: Number of files considered per query.
            verbose: Whether to print per-query details.

        Returns:
            The populated report, or ``None`` when the vector store is empty.
        """
        print(f"\n{'='*60}")
        print(f"🔍 Retrieval Evaluation")
        print(f"{'='*60}")
        print(f"Repository: {repo_url}")
        print(f"Top-K: {top_k}")
        print(f"Test Cases: {len(self.golden_dataset)}")
        print(f"{'='*60}\n")

        # Fetch the repository file list (synchronous call, no await needed).
        print("📂 Fetching repository structure...")
        file_list = get_repo_structure(repo_url)
        print(f" Found {len(file_list)} files")

        # Look up the vector store; it must already be indexed.
        store = store_manager.get_store(session_id)
        chunk_count = store.collection.count()
        if chunk_count == 0:
            print("\n⚠️ Vector store is empty!")
            print(" Please run the agent first to index the repository.")
            print(" Example: Access http://localhost:8000 and analyze the repo first.")
            return None
        print(f" Vector store has {chunk_count} chunks")

        report = EvaluationReport(
            repo_url=repo_url,
            top_k=top_k,
            total_queries=len(self.golden_dataset)
        )

        hits = 0
        skipped = 0
        recalls = []
        precisions = []
        reciprocal_ranks = []
        difficulty_stats = {}

        for i, sample in enumerate(self.golden_dataset):
            query = sample.get("query", "")
            expected_files = sample.get("expected_files", [])
            difficulty = sample.get("difficulty", "medium")
            category = sample.get("category", "general")

            if not query or not expected_files:
                skipped += 1
                continue

            # Run the retrieval (hybrid search).
            try:
                results = await store.search_hybrid(query, top_k=top_k)
            except Exception as e:
                skipped += 1
                if verbose:
                    print(f" [ERR] Search failed: {e}")
                continue

            retrieved_files = self._collect_files(results)
            top_files = retrieved_files[:top_k]
            hit, recall, precision, rr = self._score_sample(expected_files, retrieved_files, top_k)

            if hit:
                hits += 1
            recalls.append(recall)
            precisions.append(precision)
            reciprocal_ranks.append(rr)

            report.results.append(RetrievalTestResult(
                query=query,
                expected_files=expected_files,
                retrieved_files=top_files,
                hit=hit,
                recall=recall,
                precision=precision,
                reciprocal_rank=rr,
                difficulty=difficulty,
                category=category
            ))

            # Per-difficulty statistics.
            stats = difficulty_stats.setdefault(
                difficulty, {"hits": 0, "total": 0, "recalls": [], "precisions": []}
            )
            stats["total"] += 1
            if hit:
                stats["hits"] += 1
            stats["recalls"].append(recall)
            stats["precisions"].append(precision)

            # Record queries that did not retrieve any expected file.
            if not hit:
                report.failed_cases.append({
                    "query": query,
                    "expected": expected_files,
                    "retrieved": top_files,
                    "difficulty": difficulty
                })

            if verbose:
                status = "✅" if hit else "❌"
                print(f" [{i+1:3d}] {status} Recall={recall:.2f} | {query[:50]}...")
            else:
                print(f"\r Progress: {i+1}/{len(self.golden_dataset)}", end="")

        print("\n")

        if skipped:
            # Surface skipped samples even in non-verbose mode so a failing
            # search backend cannot go unnoticed.
            print(f"⚠️ Skipped {skipped} of {len(self.golden_dataset)} samples (missing fields or failed search)")

        # Aggregate over evaluated samples only, consistent with the
        # per-difficulty hit rate below.
        evaluated = len(recalls)
        report.hit_rate = hits / evaluated if evaluated else 0
        report.mean_recall = sum(recalls) / evaluated if evaluated else 0
        report.mean_precision = sum(precisions) / evaluated if evaluated else 0
        report.mrr = sum(reciprocal_ranks) / evaluated if evaluated else 0

        for diff, stats in difficulty_stats.items():
            report.by_difficulty[diff] = {
                "hit_rate": stats["hits"] / stats["total"] if stats["total"] else 0,
                "mean_recall": sum(stats["recalls"]) / len(stats["recalls"]) if stats["recalls"] else 0,
                "mean_precision": sum(stats["precisions"]) / len(stats["precisions"]) if stats["precisions"] else 0,
                "total": stats["total"]
            }

        return report

    def print_report(self, report: "EvaluationReport"):
        """Print a human-readable summary of ``report`` to stdout.

        At most five failed cases are listed.
        """
        print(f"\n{'='*60}")
        print(f"📊 RETRIEVAL EVALUATION REPORT")
        print(f"{'='*60}")
        print(f"Repository: {report.repo_url}")
        print(f"Top-K: {report.top_k}")
        print(f"Total Queries: {report.total_queries}")
        print(f"Timestamp: {report.timestamp}")
        print(f"{'='*60}\n")

        print("📈 OVERALL METRICS")
        print(f" Hit Rate: {report.hit_rate:.1%}")
        print(f" Mean Recall: {report.mean_recall:.1%}")
        print(f" Mean Precision: {report.mean_precision:.1%}")
        print(f" MRR: {report.mrr:.3f}")

        print(f"\n📊 BY DIFFICULTY")
        for diff, stats in sorted(report.by_difficulty.items()):
            print(f" {diff.upper():8s} | Hit: {stats['hit_rate']:.1%} | Recall: {stats['mean_recall']:.1%} | n={stats['total']}")

        if report.failed_cases:
            print(f"\n❌ FAILED CASES ({len(report.failed_cases)} total)")
            for case in report.failed_cases[:5]:  # show only the first five
                print(f" Query: {case['query'][:60]}...")
                print(f" Expected: {case['expected']}")
                print(f" Got: {case['retrieved'][:3]}...")
                print()

        print(f"{'='*60}")

    def save_report(self, report: "EvaluationReport", output_path: str = "evaluation/retrieval_report.json"):
        """Write the aggregate part of ``report`` to ``output_path`` as JSON.

        Per-query results are not written; only the headline metrics, the
        per-difficulty breakdown and the failed cases are.
        """
        # A bare filename has an empty dirname, and os.makedirs("") raises.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        data = {
            "repo_url": report.repo_url,
            "top_k": report.top_k,
            "total_queries": report.total_queries,
            "timestamp": report.timestamp,
            "metrics": {
                "hit_rate": report.hit_rate,
                "mean_recall": report.mean_recall,
                "mean_precision": report.mean_precision,
                "mrr": report.mrr
            },
            "by_difficulty": report.by_difficulty,
            "failed_cases": report.failed_cases
        }

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        print(f"\n💾 Report saved to: {output_path}")
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
def _parse_cli_args():
    """Build the command-line parser and return the parsed arguments."""
    cli = argparse.ArgumentParser(description="Evaluate retrieval system using golden dataset")
    cli.add_argument("--repo", required=True, help="GitHub repository URL to evaluate")
    cli.add_argument("--top-k", type=int, default=5, help="Number of results to retrieve (default: 5)")
    cli.add_argument("--session", default="eval_test", help="Session ID for vector store")
    cli.add_argument("--verbose", "-v", action="store_true", help="Print detailed results")
    cli.add_argument("--save", action="store_true", help="Save report to file")
    return cli.parse_args()


async def main():
    """Command-line entry point: evaluate, print the report, optionally save it."""
    args = _parse_cli_args()

    evaluator = RetrievalEvaluator()
    report = await evaluator.evaluate(
        repo_url=args.repo,
        session_id=args.session,
        top_k=args.top_k,
        verbose=args.verbose
    )

    # evaluate() yields no report when the vector store is empty.
    if not report:
        return

    evaluator.print_report(report)
    if args.save:
        evaluator.save_report(report)


if __name__ == "__main__":
    asyncio.run(main())
|
evaluation/utils.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 文件路径: evaluation/utils.py
|
| 2 |
+
"""
|
| 3 |
+
评估模块公共工具函数和常量
|
| 4 |
+
|
| 5 |
+
将重复的逻辑抽取到这里,保持代码 DRY (Don't Repeat Yourself)
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from typing import List
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# ============================================================================
|
| 12 |
+
# 闲聊/无效 Query 检测
|
| 13 |
+
# ============================================================================
|
| 14 |
+
|
| 15 |
+
# Phrases that mark a query as small talk / not a real question.
CHATTY_PATTERNS: List[str] = [
    # Chinese small talk
    "你好",
    "您好",
    "嗨",
    "在吗",
    "在不在",
    "谢谢",
    "多谢",
    "再见",
    "拜拜",
    "什么是",
    "你是谁",
    "你叫什么",
    "帮帮我",
    "教教我",
    # English small talk
    "hello",
    "hi",
    "hey",
    "thanks",
    "thank you",
    "bye",
    "goodbye",
    "what is",
    "who are you",
    "help me",
    "can you",
    # Single words / very short inputs
    "test",
    "测试",
    "ok",
    "yes",
    "no",
]

# Substrings that indicate a text contains source code.
CODE_INDICATORS: List[str] = [
    # Python
    "def ",
    "class ",
    "import ",
    "from ",
    # JavaScript/TypeScript
    "function ",
    "const ",
    "let ",
    "var ",
    # Java/C#
    "public ",
    "private ",
    "void ",
    # Go
    "func ",
    "package ",
    # Generic
    "```",  # Markdown code fence
]
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def is_chatty_query(query: str, min_length: int = 5) -> bool:
    """Tell whether a query is small talk or otherwise not a usable question.

    The query is lower-cased and stripped before checking. It counts as
    chatty when it is empty, shorter than ``min_length`` characters, equal to
    one of ``CHATTY_PATTERNS``, or starts with such a pattern followed by a
    space.

    Args:
        query: The user query.
        min_length: Minimum length of a valid query; anything shorter is
            treated as invalid.

    Returns:
        True if the query is chatty / invalid, False otherwise.
    """
    if not query:
        return True

    normalized = query.lower().strip()
    if len(normalized) < min_length:
        return True

    return any(
        normalized == pattern or normalized.startswith(pattern + " ")
        for pattern in CHATTY_PATTERNS
    )
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def has_code_indicators(text: str) -> bool:
    """Tell whether ``text`` contains any of the ``CODE_INDICATORS`` substrings.

    Args:
        text: The text to inspect.

    Returns:
        True if at least one indicator occurs in the text; False for empty
        text or when none occurs.
    """
    if not text:
        return False
    return any(marker in text for marker in CODE_INDICATORS)
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
# ============================================================================
|
| 90 |
+
# 文件操作工具
|
| 91 |
+
# ============================================================================
|
| 92 |
+
|
| 93 |
+
def append_jsonl(filepath: str, data: dict) -> None:
    """Append ``data`` as one JSON line to a JSONL file.

    Non-ASCII characters are written as-is (UTF-8), not escaped.

    Args:
        filepath: Path of the JSONL file; opened in append mode.
        data: The dictionary to serialise and append.
    """
    import json

    with open(filepath, mode='a', encoding='utf-8') as handle:
        record_line = json.dumps(data, ensure_ascii=False)
        handle.write(f"{record_line}\n")
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def read_jsonl(filepath: str) -> list:
    """Read a JSONL file and return its parsed records.

    Lines that are not valid JSON are skipped.

    Args:
        filepath: Path of the JSONL file.

    Returns:
        The list of parsed records; an empty list if the file does not exist.
    """
    import json
    import os

    if not os.path.exists(filepath):
        return []

    records = []
    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            try:
                parsed = json.loads(raw_line)
            except json.JSONDecodeError:
                # Best-effort reader: ignore malformed or blank lines.
                continue
            records.append(parsed)
    return records
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def safe_truncate(text: str, max_length: int, suffix: str = "\n... [truncated]") -> str:
    """Truncate ``text`` so that the result never exceeds ``max_length``.

    When truncation is needed, the suffix is counted against the limit: the
    text is cut early enough that ``kept text + suffix`` is exactly
    ``max_length`` characters long. If the limit is too small to hold the
    suffix, the text is hard-cut to ``max_length`` without a suffix.

    Args:
        text: The original text.
        max_length: Maximum length of the returned text.
        suffix: Marker appended to truncated text.

    Returns:
        ``text`` unchanged if it is empty/None or already fits, otherwise
        the truncated text.
    """
    if not text or len(text) <= max_length:
        return text

    limit = max(max_length, 0)  # a negative limit behaves like 0
    keep = limit - len(suffix)
    if keep <= 0:
        # No room for the marker: a plain hard cut still honours the limit.
        return text[:limit]
    return text[:keep] + suffix
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
def smart_truncate(text: str, max_length: int, keep_ratio: float = 0.7) -> str:
    """Truncate ``text`` by keeping its head and tail and dropping the middle.

    Intended for code context, where both the beginning and the end of a
    snippet tend to matter. The omission marker is counted against
    ``max_length``; the remaining budget is split between head and tail
    according to ``keep_ratio``.

    Args:
        text: The original text.
        max_length: Maximum length of the returned text.
        keep_ratio: Share of the budget given to the head (default 70% head,
            30% tail). Values outside [0, 1] are clamped.

    Returns:
        ``text`` unchanged if it is empty/None or already fits, otherwise
        the head, the omission marker and the tail. If the limit cannot
        hold the marker, the text is hard-cut to ``max_length``.
    """
    if not text or len(text) <= max_length:
        return text

    separator = "\n\n... [中间内容已省略] ...\n\n"
    available = max_length - len(separator)

    if available <= 0:
        return text[:max(max_length, 0)]

    ratio = min(max(keep_ratio, 0.0), 1.0)
    head_len = int(available * ratio)
    tail_len = available - head_len

    # text[-0:] would return the WHOLE text, so an empty tail must be
    # handled explicitly.
    tail = text[-tail_len:] if tail_len > 0 else ""
    return text[:head_len] + separator + tail
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
# ============================================================================
|
| 177 |
+
# SFT 数据长度配置
|
| 178 |
+
# ============================================================================
|
| 179 |
+
|
| 180 |
+
class SFTLengthConfig:
    """Length limits applied to SFT training data."""

    # Limit for the retrieved code context (about 800 tokens).
    MAX_CONTEXT_CHARS = 2500

    # Limit for the model-generated answer (about 1000 tokens).
    MAX_ANSWER_CHARS = 3000

    # Limit for the query.
    MAX_QUERY_CHARS = 500

    # Upper bound for a whole sample (about 2000 tokens).
    MAX_TOTAL_CHARS = 6000

    # Token estimate: average characters per token, a conservative figure
    # for mixed Chinese/English text.
    CHARS_PER_TOKEN = 3
|
frontend-dist/assets/Tableau10-B-NsZVaP.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
/**
 * Split a string of concatenated 6-digit hex colours into "#rrggbb" entries.
 * Trailing characters that do not fill a whole 6-character group are ignored.
 *
 * @param {string} e - Concatenated hex colour specifiers.
 * @returns {string[]} One "#"-prefixed colour per 6-character group.
 */
function o(e) {
  const count = (e.length / 6) | 0;
  return Array.from({ length: count }, (_, i) => `#${e.slice(i * 6, (i + 1) * 6)}`);
}

// Ten-colour palette, exported under the bundle alias `s`.
const r = o("4e79a7f28e2ce1575976b7b259a14fedc949af7aa1ff9da79c755fbab0ab");

export { r as s };
|
frontend-dist/assets/arc-BscbqCCW.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
/*
 * Minified bundle chunk that exports an arc path generator factory, `vn`, under the alias `a`.
 * NOTE(review): this looks like a build of d3-shape's arc() -- confirm against the bundle's dependencies
 * before editing; it is generated output and should normally be regenerated rather than hand-edited.
 *
 * - cn/yn/gn/mn/pn: default accessors reading innerRadius, outerRadius, startAngle, endAngle and
 *   padAngle from the datum.
 * - xn: intersects two lines given by point pairs; returns nothing when the determinant is below `y`.
 * - W: computes centre (cx, cy) and tangent offsets (x01, y01, x11, y11) used for rounded corners.
 * - vn(): returns the generator `i`. Calling `i` draws into the configured context with
 *   moveTo/arc/lineTo/closePath; when no context is set it creates one via `B()` and returns its
 *   string form (or null when that string is empty). `i` also exposes centroid() and getter/setter
 *   methods innerRadius, outerRadius, cornerRadius, padRadius, startAngle, endAngle, padAngle, context.
 */
import{w as ln,c as I}from"./path-CbwjOpE9.js";import{av as an,aw as j,ax as D,ay as rn,az as y,V as on,aA as K,aB as _,aC as un,aD as t,aE as tn,aF as sn,aG as fn}from"./index-BCNM9-Ly.js";function cn(l){return l.innerRadius}function yn(l){return l.outerRadius}function gn(l){return l.startAngle}function mn(l){return l.endAngle}function pn(l){return l&&l.padAngle}function xn(l,h,z,E,v,A,O,a){var B=z-l,i=E-h,n=O-v,m=a-A,r=m*B-n*i;if(!(r*r<y))return r=(n*(h-A)-m*(l-v))/r,[l+r*B,h+r*i]}function W(l,h,z,E,v,A,O){var a=l-z,B=h-E,i=(O?A:-A)/K(a*a+B*B),n=i*B,m=-i*a,r=l+n,s=h+m,f=z+n,c=E+m,S=(r+f)/2,o=(s+c)/2,p=f-r,g=c-s,R=p*p+g*g,T=v-A,w=r*c-f*s,C=(g<0?-1:1)*K(tn(0,T*T*R-w*w)),F=(w*g-p*C)/R,G=(-w*p-g*C)/R,P=(w*g+p*C)/R,x=(-w*p+g*C)/R,d=F-S,e=G-o,u=P-S,V=x-o;return d*d+e*e>u*u+V*V&&(F=P,G=x),{cx:F,cy:G,x01:-n,y01:-m,x11:F*(v/T-1),y11:G*(v/T-1)}}function vn(){var l=cn,h=yn,z=I(0),E=null,v=gn,A=mn,O=pn,a=null,B=ln(i);function i(){var n,m,r=+l.apply(this,arguments),s=+h.apply(this,arguments),f=v.apply(this,arguments)-rn,c=A.apply(this,arguments)-rn,S=un(c-f),o=c>f;if(a||(a=n=B()),s<r&&(m=s,s=r,r=m),!(s>y))a.moveTo(0,0);else if(S>on-y)a.moveTo(s*j(f),s*D(f)),a.arc(0,0,s,f,c,!o),r>y&&(a.moveTo(r*j(c),r*D(c)),a.arc(0,0,r,c,f,o));else{var p=f,g=c,R=f,T=c,w=S,C=S,F=O.apply(this,arguments)/2,G=F>y&&(E?+E.apply(this,arguments):K(r*r+s*s)),P=_(un(s-r)/2,+z.apply(this,arguments)),x=P,d=P,e,u;if(G>y){var V=sn(G/r*D(F)),L=sn(G/s*D(F));(w-=V*2)>y?(V*=o?1:-1,R+=V,T-=V):(w=0,R=T=(f+c)/2),(C-=L*2)>y?(L*=o?1:-1,p+=L,g-=L):(C=0,p=g=(f+c)/2)}var H=s*j(p),J=s*D(p),M=r*j(T),N=r*D(T);if(P>y){var Q=s*j(g),U=s*D(g),X=r*j(R),Y=r*D(R),q;if(S<an)if(q=xn(H,J,X,Y,Q,U,M,N)){var Z=H-q[0],$=J-q[1],k=Q-q[0],b=U-q[1],nn=1/D(fn((Z*k+$*b)/(K(Z*Z+$*$)*K(k*k+b*b)))/2),en=K(q[0]*q[0]+q[1]*q[1]);x=_(P,(r-en)/(nn-1)),d=_(P,(s-en)/(nn+1))}else 
x=d=0}C>y?d>y?(e=W(X,Y,H,J,s,d,o),u=W(Q,U,M,N,s,d,o),a.moveTo(e.cx+e.x01,e.cy+e.y01),d<P?a.arc(e.cx,e.cy,d,t(e.y01,e.x01),t(u.y01,u.x01),!o):(a.arc(e.cx,e.cy,d,t(e.y01,e.x01),t(e.y11,e.x11),!o),a.arc(0,0,s,t(e.cy+e.y11,e.cx+e.x11),t(u.cy+u.y11,u.cx+u.x11),!o),a.arc(u.cx,u.cy,d,t(u.y11,u.x11),t(u.y01,u.x01),!o))):(a.moveTo(H,J),a.arc(0,0,s,p,g,!o)):a.moveTo(H,J),!(r>y)||!(w>y)?a.lineTo(M,N):x>y?(e=W(M,N,Q,U,r,-x,o),u=W(H,J,X,Y,r,-x,o),a.lineTo(e.cx+e.x01,e.cy+e.y01),x<P?a.arc(e.cx,e.cy,x,t(e.y01,e.x01),t(u.y01,u.x01),!o):(a.arc(e.cx,e.cy,x,t(e.y01,e.x01),t(e.y11,e.x11),!o),a.arc(0,0,r,t(e.cy+e.y11,e.cx+e.x11),t(u.cy+u.y11,u.cx+u.x11),o),a.arc(u.cx,u.cy,x,t(u.y11,u.x11),t(u.y01,u.x01),!o))):a.arc(0,0,r,T,R,o)}if(a.closePath(),n)return a=null,n+""||null}return i.centroid=function(){var n=(+l.apply(this,arguments)+ +h.apply(this,arguments))/2,m=(+v.apply(this,arguments)+ +A.apply(this,arguments))/2-an/2;return[j(m)*n,D(m)*n]},i.innerRadius=function(n){return arguments.length?(l=typeof n=="function"?n:I(+n),i):l},i.outerRadius=function(n){return arguments.length?(h=typeof n=="function"?n:I(+n),i):h},i.cornerRadius=function(n){return arguments.length?(z=typeof n=="function"?n:I(+n),i):z},i.padRadius=function(n){return arguments.length?(E=n==null?null:typeof n=="function"?n:I(+n),i):E},i.startAngle=function(n){return arguments.length?(v=typeof n=="function"?n:I(+n),i):v},i.endAngle=function(n){return arguments.length?(A=typeof n=="function"?n:I(+n),i):A},i.padAngle=function(n){return arguments.length?(O=typeof n=="function"?n:I(+n),i):O},i.context=function(n){return arguments.length?(a=n??null,i):a},i}export{vn as a};
|
frontend-dist/assets/array-BKyUJesY.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
/**
 * Return `r` itself when it is an object exposing a `length` property,
 * otherwise materialise it into a new array with `Array.from`.
 *
 * @param {*} r - An array-like object or any value accepted by `Array.from`.
 * @returns {*} The original array-like object, or a new array.
 */
function t(r) {
  const isArrayLike = typeof r === "object" && "length" in r;
  return isArrayLike ? r : Array.from(r);
}

export { t as a };
|
frontend-dist/assets/blockDiagram-c4efeb88-CL85BYG9.js
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import{_ as se,d as H,e as ye,l as S,E as Ee,B as we,k as De,c as he,p as ve}from"./index-BCNM9-Ly.js";import{c as Ne}from"./clone-C4pHamD7.js";import{i as ke,c as Ie,b as Oe,d as Te,a as ge,p as ze}from"./edges-96097737-CqpaF4BI.js";import{G as Ce}from"./graph-CY8eBbAS.js";import{o as Ae}from"./ordinal-Cboi1Yqb.js";import{c as Re}from"./channel-DsKT-zfZ.js";import{s as Be}from"./Tableau10-B-NsZVaP.js";import"./createText-1719965b-BZ0xZVnk.js";import"./line-DdWeXrJe.js";import"./array-BKyUJesY.js";import"./path-CbwjOpE9.js";import"./init-Gi6I4Gst.js";var le,oe,ee=function(){var e=function(D,o,s,i){for(s=s||{},i=D.length;i--;s[D[i]]=o);return s},a=[1,7],d=[1,13],c=[1,14],n=[1,15],g=[1,19],l=[1,16],f=[1,17],b=[1,18],p=[8,30],x=[8,21,28,29,30,31,32,40,44,47],y=[1,23],T=[1,24],v=[8,15,16,21,28,29,30,31,32,40,44,47],N=[8,15,16,21,27,28,29,30,31,32,40,44,47],E=[1,49],L={trace:function(){},yy:{},symbols_:{error:2,spaceLines:3,SPACELINE:4,NL:5,separator:6,SPACE:7,EOF:8,start:9,BLOCK_DIAGRAM_KEY:10,document:11,stop:12,statement:13,link:14,LINK:15,START_LINK:16,LINK_LABEL:17,STR:18,nodeStatement:19,columnsStatement:20,SPACE_BLOCK:21,blockStatement:22,classDefStatement:23,cssClassStatement:24,styleStatement:25,node:26,SIZE:27,COLUMNS:28,"id-block":29,end:30,block:31,NODE_ID:32,nodeShapeNLabel:33,dirList:34,DIR:35,NODE_DSTART:36,NODE_DEND:37,BLOCK_ARROW_START:38,BLOCK_ARROW_END:39,classDef:40,CLASSDEF_ID:41,CLASSDEF_STYLEOPTS:42,DEFAULT:43,class:44,CLASSENTITY_IDS:45,STYLECLASS:46,style:47,STYLE_ENTITY_IDS:48,STYLE_DEFINITION_DATA:49,$accept:0,$end:1},terminals_:{2:"error",4:"SPACELINE",5:"NL",7:"SPACE",8:"EOF",10:"BLOCK_DIAGRAM_KEY",15:"LINK",16:"START_LINK",17:"LINK_LABEL",18:"STR",21:"SPACE_BLOCK",27:"SIZE",28:"COLUMNS",29:"id-block",30:"end",31:"block",32:"NODE_ID",35:"DIR",36:"NODE_DSTART",37:"NODE_DEND",38:"BLOCK_ARROW_START",39:"BLOCK_ARROW_END",40:"classDef",41:"CLASSDEF_ID",42:"CLASSDEF_STYLEOPTS",43:"DEFAULT",44:"class",45:"CLASSENTITY_IDS",46:"STYLECLASS",47:"style",
48:"STYLE_ENTITY_IDS",49:"STYLE_DEFINITION_DATA"},productions_:[0,[3,1],[3,2],[3,2],[6,1],[6,1],[6,1],[9,3],[12,1],[12,1],[12,2],[12,2],[11,1],[11,2],[14,1],[14,4],[13,1],[13,1],[13,1],[13,1],[13,1],[13,1],[13,1],[19,3],[19,2],[19,1],[20,1],[22,4],[22,3],[26,1],[26,2],[34,1],[34,2],[33,3],[33,4],[23,3],[23,3],[24,3],[25,3]],performAction:function(o,s,i,u,h,t,m){var r=t.length-1;switch(h){case 4:u.getLogger().debug("Rule: separator (NL) ");break;case 5:u.getLogger().debug("Rule: separator (Space) ");break;case 6:u.getLogger().debug("Rule: separator (EOF) ");break;case 7:u.getLogger().debug("Rule: hierarchy: ",t[r-1]),u.setHierarchy(t[r-1]);break;case 8:u.getLogger().debug("Stop NL ");break;case 9:u.getLogger().debug("Stop EOF ");break;case 10:u.getLogger().debug("Stop NL2 ");break;case 11:u.getLogger().debug("Stop EOF2 ");break;case 12:u.getLogger().debug("Rule: statement: ",t[r]),typeof t[r].length=="number"?this.$=t[r]:this.$=[t[r]];break;case 13:u.getLogger().debug("Rule: statement #2: ",t[r-1]),this.$=[t[r-1]].concat(t[r]);break;case 14:u.getLogger().debug("Rule: link: ",t[r],o),this.$={edgeTypeStr:t[r],label:""};break;case 15:u.getLogger().debug("Rule: LABEL link: ",t[r-3],t[r-1],t[r]),this.$={edgeTypeStr:t[r],label:t[r-1]};break;case 18:const R=parseInt(t[r]),Y=u.generateId();this.$={id:Y,type:"space",label:"",width:R,children:[]};break;case 23:u.getLogger().debug("Rule: (nodeStatement link node) ",t[r-2],t[r-1],t[r]," typestr: ",t[r-1].edgeTypeStr);const F=u.edgeStrToEdgeData(t[r-1].edgeTypeStr);this.$=[{id:t[r-2].id,label:t[r-2].label,type:t[r-2].type,directions:t[r-2].directions},{id:t[r-2].id+"-"+t[r].id,start:t[r-2].id,end:t[r].id,label:t[r-1].label,type:"edge",directions:t[r].directions,arrowTypeEnd:F,arrowTypeStart:"arrow_open"},{id:t[r].id,label:t[r].label,type:u.typeStr2Type(t[r].typeStr),directions:t[r].directions}];break;case 24:u.getLogger().debug("Rule: nodeStatement (abc88 node size) 
",t[r-1],t[r]),this.$={id:t[r-1].id,label:t[r-1].label,type:u.typeStr2Type(t[r-1].typeStr),directions:t[r-1].directions,widthInColumns:parseInt(t[r],10)};break;case 25:u.getLogger().debug("Rule: nodeStatement (node) ",t[r]),this.$={id:t[r].id,label:t[r].label,type:u.typeStr2Type(t[r].typeStr),directions:t[r].directions,widthInColumns:1};break;case 26:u.getLogger().debug("APA123",this?this:"na"),u.getLogger().debug("COLUMNS: ",t[r]),this.$={type:"column-setting",columns:t[r]==="auto"?-1:parseInt(t[r])};break;case 27:u.getLogger().debug("Rule: id-block statement : ",t[r-2],t[r-1]),u.generateId(),this.$={...t[r-2],type:"composite",children:t[r-1]};break;case 28:u.getLogger().debug("Rule: blockStatement : ",t[r-2],t[r-1],t[r]);const C=u.generateId();this.$={id:C,type:"composite",label:"",children:t[r-1]};break;case 29:u.getLogger().debug("Rule: node (NODE_ID separator): ",t[r]),this.$={id:t[r]};break;case 30:u.getLogger().debug("Rule: node (NODE_ID nodeShapeNLabel separator): ",t[r-1],t[r]),this.$={id:t[r-1],label:t[r].label,typeStr:t[r].typeStr,directions:t[r].directions};break;case 31:u.getLogger().debug("Rule: dirList: ",t[r]),this.$=[t[r]];break;case 32:u.getLogger().debug("Rule: dirList: ",t[r-1],t[r]),this.$=[t[r-1]].concat(t[r]);break;case 33:u.getLogger().debug("Rule: nodeShapeNLabel: ",t[r-2],t[r-1],t[r]),this.$={typeStr:t[r-2]+t[r],label:t[r-1]};break;case 34:u.getLogger().debug("Rule: BLOCK_ARROW nodeShapeNLabel: ",t[r-3],t[r-2]," #3:",t[r-1],t[r]),this.$={typeStr:t[r-3]+t[r],label:t[r-2],directions:t[r-1]};break;case 35:case 36:this.$={type:"classDef",id:t[r-1].trim(),css:t[r].trim()};break;case 37:this.$={type:"applyClass",id:t[r-1].trim(),styleClass:t[r].trim()};break;case 
38:this.$={type:"applyStyles",id:t[r-1].trim(),stylesStr:t[r].trim()};break}},table:[{9:1,10:[1,2]},{1:[3]},{11:3,13:4,19:5,20:6,21:a,22:8,23:9,24:10,25:11,26:12,28:d,29:c,31:n,32:g,40:l,44:f,47:b},{8:[1,20]},e(p,[2,12],{13:4,19:5,20:6,22:8,23:9,24:10,25:11,26:12,11:21,21:a,28:d,29:c,31:n,32:g,40:l,44:f,47:b}),e(x,[2,16],{14:22,15:y,16:T}),e(x,[2,17]),e(x,[2,18]),e(x,[2,19]),e(x,[2,20]),e(x,[2,21]),e(x,[2,22]),e(v,[2,25],{27:[1,25]}),e(x,[2,26]),{19:26,26:12,32:g},{11:27,13:4,19:5,20:6,21:a,22:8,23:9,24:10,25:11,26:12,28:d,29:c,31:n,32:g,40:l,44:f,47:b},{41:[1,28],43:[1,29]},{45:[1,30]},{48:[1,31]},e(N,[2,29],{33:32,36:[1,33],38:[1,34]}),{1:[2,7]},e(p,[2,13]),{26:35,32:g},{32:[2,14]},{17:[1,36]},e(v,[2,24]),{11:37,13:4,14:22,15:y,16:T,19:5,20:6,21:a,22:8,23:9,24:10,25:11,26:12,28:d,29:c,31:n,32:g,40:l,44:f,47:b},{30:[1,38]},{42:[1,39]},{42:[1,40]},{46:[1,41]},{49:[1,42]},e(N,[2,30]),{18:[1,43]},{18:[1,44]},e(v,[2,23]),{18:[1,45]},{30:[1,46]},e(x,[2,28]),e(x,[2,35]),e(x,[2,36]),e(x,[2,37]),e(x,[2,38]),{37:[1,47]},{34:48,35:E},{15:[1,50]},e(x,[2,27]),e(N,[2,33]),{39:[1,51]},{34:52,35:E,39:[2,31]},{32:[2,15]},e(N,[2,34]),{39:[2,32]}],defaultActions:{20:[2,7],23:[2,14],50:[2,15],52:[2,32]},parseError:function(o,s){if(s.recoverable)this.trace(o);else{var i=new Error(o);throw i.hash=s,i}},parse:function(o){var s=this,i=[0],u=[],h=[null],t=[],m=this.table,r="",R=0,Y=0,F=2,C=1,Le=t.slice.call(arguments,1),w=Object.create(this.lexer),K={yy:{}};for(var Z in this.yy)Object.prototype.hasOwnProperty.call(this.yy,Z)&&(K.yy[Z]=this.yy[Z]);w.setInput(o,K.yy),K.yy.lexer=w,K.yy.parser=this,typeof w.yylloc>"u"&&(w.yylloc={});var J=w.yylloc;t.push(J);var me=w.options&&w.options.ranges;typeof K.yy.parseError=="function"?this.parseError=K.yy.parseError:this.parseError=Object.getPrototypeOf(this).parseError;function _e(){var P;return P=u.pop()||w.lex()||C,typeof P!="number"&&(P instanceof Array&&(u=P,P=u.pop()),P=s.symbols_[P]||P),P}for(var 
I,M,z,Q,W={},X,B,ae,G;;){if(M=i[i.length-1],this.defaultActions[M]?z=this.defaultActions[M]:((I===null||typeof I>"u")&&(I=_e()),z=m[M]&&m[M][I]),typeof z>"u"||!z.length||!z[0]){var $="";G=[];for(X in m[M])this.terminals_[X]&&X>F&&G.push("'"+this.terminals_[X]+"'");w.showPosition?$="Parse error on line "+(R+1)+`:
|
| 2 |
+
`+w.showPosition()+`
|
| 3 |
+
Expecting `+G.join(", ")+", got '"+(this.terminals_[I]||I)+"'":$="Parse error on line "+(R+1)+": Unexpected "+(I==C?"end of input":"'"+(this.terminals_[I]||I)+"'"),this.parseError($,{text:w.match,token:this.terminals_[I]||I,line:w.yylineno,loc:J,expected:G})}if(z[0]instanceof Array&&z.length>1)throw new Error("Parse Error: multiple actions possible at state: "+M+", token: "+I);switch(z[0]){case 1:i.push(I),h.push(w.yytext),t.push(w.yylloc),i.push(z[1]),I=null,Y=w.yyleng,r=w.yytext,R=w.yylineno,J=w.yylloc;break;case 2:if(B=this.productions_[z[1]][1],W.$=h[h.length-B],W._$={first_line:t[t.length-(B||1)].first_line,last_line:t[t.length-1].last_line,first_column:t[t.length-(B||1)].first_column,last_column:t[t.length-1].last_column},me&&(W._$.range=[t[t.length-(B||1)].range[0],t[t.length-1].range[1]]),Q=this.performAction.apply(W,[r,Y,R,K.yy,z[1],h,t].concat(Le)),typeof Q<"u")return Q;B&&(i=i.slice(0,-1*B*2),h=h.slice(0,-1*B),t=t.slice(0,-1*B)),i.push(this.productions_[z[1]][0]),h.push(W.$),t.push(W._$),ae=m[i[i.length-2]][i[i.length-1]],i.push(ae);break;case 3:return!0}}return!0}},A=function(){var D={EOF:1,parseError:function(s,i){if(this.yy.parser)this.yy.parser.parseError(s,i);else throw new Error(s)},setInput:function(o,s){return this.yy=s||this.yy||{},this._input=o,this._more=this._backtrack=this.done=!1,this.yylineno=this.yyleng=0,this.yytext=this.matched=this.match="",this.conditionStack=["INITIAL"],this.yylloc={first_line:1,first_column:0,last_line:1,last_column:0},this.options.ranges&&(this.yylloc.range=[0,0]),this.offset=0,this},input:function(){var o=this._input[0];this.yytext+=o,this.yyleng++,this.offset++,this.match+=o,this.matched+=o;var s=o.match(/(?:\r\n?|\n).*/g);return s?(this.yylineno++,this.yylloc.last_line++):this.yylloc.last_column++,this.options.ranges&&this.yylloc.range[1]++,this._input=this._input.slice(1),o},unput:function(o){var 
s=o.length,i=o.split(/(?:\r\n?|\n)/g);this._input=o+this._input,this.yytext=this.yytext.substr(0,this.yytext.length-s),this.offset-=s;var u=this.match.split(/(?:\r\n?|\n)/g);this.match=this.match.substr(0,this.match.length-1),this.matched=this.matched.substr(0,this.matched.length-1),i.length-1&&(this.yylineno-=i.length-1);var h=this.yylloc.range;return this.yylloc={first_line:this.yylloc.first_line,last_line:this.yylineno+1,first_column:this.yylloc.first_column,last_column:i?(i.length===u.length?this.yylloc.first_column:0)+u[u.length-i.length].length-i[0].length:this.yylloc.first_column-s},this.options.ranges&&(this.yylloc.range=[h[0],h[0]+this.yyleng-s]),this.yyleng=this.yytext.length,this},more:function(){return this._more=!0,this},reject:function(){if(this.options.backtrack_lexer)this._backtrack=!0;else return this.parseError("Lexical error on line "+(this.yylineno+1)+`. You can only invoke reject() in the lexer when the lexer is of the backtracking persuasion (options.backtrack_lexer = true).
|
| 4 |
+
`+this.showPosition(),{text:"",token:null,line:this.yylineno});return this},less:function(o){this.unput(this.match.slice(o))},pastInput:function(){var o=this.matched.substr(0,this.matched.length-this.match.length);return(o.length>20?"...":"")+o.substr(-20).replace(/\n/g,"")},upcomingInput:function(){var o=this.match;return o.length<20&&(o+=this._input.substr(0,20-o.length)),(o.substr(0,20)+(o.length>20?"...":"")).replace(/\n/g,"")},showPosition:function(){var o=this.pastInput(),s=new Array(o.length+1).join("-");return o+this.upcomingInput()+`
|
| 5 |
+
`+s+"^"},test_match:function(o,s){var i,u,h;if(this.options.backtrack_lexer&&(h={yylineno:this.yylineno,yylloc:{first_line:this.yylloc.first_line,last_line:this.last_line,first_column:this.yylloc.first_column,last_column:this.yylloc.last_column},yytext:this.yytext,match:this.match,matches:this.matches,matched:this.matched,yyleng:this.yyleng,offset:this.offset,_more:this._more,_input:this._input,yy:this.yy,conditionStack:this.conditionStack.slice(0),done:this.done},this.options.ranges&&(h.yylloc.range=this.yylloc.range.slice(0))),u=o[0].match(/(?:\r\n?|\n).*/g),u&&(this.yylineno+=u.length),this.yylloc={first_line:this.yylloc.last_line,last_line:this.yylineno+1,first_column:this.yylloc.last_column,last_column:u?u[u.length-1].length-u[u.length-1].match(/\r?\n?/)[0].length:this.yylloc.last_column+o[0].length},this.yytext+=o[0],this.match+=o[0],this.matches=o,this.yyleng=this.yytext.length,this.options.ranges&&(this.yylloc.range=[this.offset,this.offset+=this.yyleng]),this._more=!1,this._backtrack=!1,this._input=this._input.slice(o[0].length),this.matched+=o[0],i=this.performAction.call(this,this.yy,this,s,this.conditionStack[this.conditionStack.length-1]),this.done&&this._input&&(this.done=!1),i)return i;if(this._backtrack){for(var t in h)this[t]=h[t];return!1}return!1},next:function(){if(this.done)return this.EOF;this._input||(this.done=!0);var o,s,i,u;this._more||(this.yytext="",this.match="");for(var h=this._currentRules(),t=0;t<h.length;t++)if(i=this._input.match(this.rules[h[t]]),i&&(!s||i[0].length>s[0].length)){if(s=i,u=t,this.options.backtrack_lexer){if(o=this.test_match(i,h[t]),o!==!1)return o;if(this._backtrack){s=!1;continue}else return!1}else if(!this.options.flex)break}return s?(o=this.test_match(s,h[u]),o!==!1?o:!1):this._input===""?this.EOF:this.parseError("Lexical error on line "+(this.yylineno+1)+`. Unrecognized text.
|
| 6 |
+
`+this.showPosition(),{text:"",token:null,line:this.yylineno})},lex:function(){var s=this.next();return s||this.lex()},begin:function(s){this.conditionStack.push(s)},popState:function(){var s=this.conditionStack.length-1;return s>0?this.conditionStack.pop():this.conditionStack[0]},_currentRules:function(){return this.conditionStack.length&&this.conditionStack[this.conditionStack.length-1]?this.conditions[this.conditionStack[this.conditionStack.length-1]].rules:this.conditions.INITIAL.rules},topState:function(s){return s=this.conditionStack.length-1-Math.abs(s||0),s>=0?this.conditionStack[s]:"INITIAL"},pushState:function(s){this.begin(s)},stateStackSize:function(){return this.conditionStack.length},options:{},performAction:function(s,i,u,h){switch(u){case 0:return 10;case 1:return s.getLogger().debug("Found space-block"),31;case 2:return s.getLogger().debug("Found nl-block"),31;case 3:return s.getLogger().debug("Found space-block"),29;case 4:s.getLogger().debug(".",i.yytext);break;case 5:s.getLogger().debug("_",i.yytext);break;case 6:return 5;case 7:return i.yytext=-1,28;case 8:return i.yytext=i.yytext.replace(/columns\s+/,""),s.getLogger().debug("COLUMNS (LEX)",i.yytext),28;case 9:this.pushState("md_string");break;case 10:return"MD_STR";case 11:this.popState();break;case 12:this.pushState("string");break;case 13:s.getLogger().debug("LEX: POPPING STR:",i.yytext),this.popState();break;case 14:return s.getLogger().debug("LEX: STR end:",i.yytext),"STR";case 15:return i.yytext=i.yytext.replace(/space\:/,""),s.getLogger().debug("SPACE NUM (LEX)",i.yytext),21;case 16:return i.yytext="1",s.getLogger().debug("COLUMNS (LEX)",i.yytext),21;case 17:return 43;case 18:return"LINKSTYLE";case 19:return"INTERPOLATE";case 20:return this.pushState("CLASSDEF"),40;case 21:return this.popState(),this.pushState("CLASSDEFID"),"DEFAULT_CLASSDEF_ID";case 22:return this.popState(),this.pushState("CLASSDEFID"),41;case 23:return this.popState(),42;case 24:return this.pushState("CLASS"),44;case 
25:return this.popState(),this.pushState("CLASS_STYLE"),45;case 26:return this.popState(),46;case 27:return this.pushState("STYLE_STMNT"),47;case 28:return this.popState(),this.pushState("STYLE_DEFINITION"),48;case 29:return this.popState(),49;case 30:return this.pushState("acc_title"),"acc_title";case 31:return this.popState(),"acc_title_value";case 32:return this.pushState("acc_descr"),"acc_descr";case 33:return this.popState(),"acc_descr_value";case 34:this.pushState("acc_descr_multiline");break;case 35:this.popState();break;case 36:return"acc_descr_multiline_value";case 37:return 30;case 38:return this.popState(),s.getLogger().debug("Lex: (("),"NODE_DEND";case 39:return this.popState(),s.getLogger().debug("Lex: (("),"NODE_DEND";case 40:return this.popState(),s.getLogger().debug("Lex: ))"),"NODE_DEND";case 41:return this.popState(),s.getLogger().debug("Lex: (("),"NODE_DEND";case 42:return this.popState(),s.getLogger().debug("Lex: (("),"NODE_DEND";case 43:return this.popState(),s.getLogger().debug("Lex: (-"),"NODE_DEND";case 44:return this.popState(),s.getLogger().debug("Lex: -)"),"NODE_DEND";case 45:return this.popState(),s.getLogger().debug("Lex: (("),"NODE_DEND";case 46:return this.popState(),s.getLogger().debug("Lex: ]]"),"NODE_DEND";case 47:return this.popState(),s.getLogger().debug("Lex: ("),"NODE_DEND";case 48:return this.popState(),s.getLogger().debug("Lex: ])"),"NODE_DEND";case 49:return this.popState(),s.getLogger().debug("Lex: /]"),"NODE_DEND";case 50:return this.popState(),s.getLogger().debug("Lex: /]"),"NODE_DEND";case 51:return this.popState(),s.getLogger().debug("Lex: )]"),"NODE_DEND";case 52:return this.popState(),s.getLogger().debug("Lex: )"),"NODE_DEND";case 53:return this.popState(),s.getLogger().debug("Lex: ]>"),"NODE_DEND";case 54:return this.popState(),s.getLogger().debug("Lex: ]"),"NODE_DEND";case 55:return s.getLogger().debug("Lexa: -)"),this.pushState("NODE"),36;case 56:return s.getLogger().debug("Lexa: (-"),this.pushState("NODE"),36;case 
57:return s.getLogger().debug("Lexa: ))"),this.pushState("NODE"),36;case 58:return s.getLogger().debug("Lexa: )"),this.pushState("NODE"),36;case 59:return s.getLogger().debug("Lex: ((("),this.pushState("NODE"),36;case 60:return s.getLogger().debug("Lexa: )"),this.pushState("NODE"),36;case 61:return s.getLogger().debug("Lexa: )"),this.pushState("NODE"),36;case 62:return s.getLogger().debug("Lexa: )"),this.pushState("NODE"),36;case 63:return s.getLogger().debug("Lexc: >"),this.pushState("NODE"),36;case 64:return s.getLogger().debug("Lexa: (["),this.pushState("NODE"),36;case 65:return s.getLogger().debug("Lexa: )"),this.pushState("NODE"),36;case 66:return this.pushState("NODE"),36;case 67:return this.pushState("NODE"),36;case 68:return this.pushState("NODE"),36;case 69:return this.pushState("NODE"),36;case 70:return this.pushState("NODE"),36;case 71:return this.pushState("NODE"),36;case 72:return this.pushState("NODE"),36;case 73:return s.getLogger().debug("Lexa: ["),this.pushState("NODE"),36;case 74:return this.pushState("BLOCK_ARROW"),s.getLogger().debug("LEX ARR START"),38;case 75:return s.getLogger().debug("Lex: NODE_ID",i.yytext),32;case 76:return s.getLogger().debug("Lex: EOF",i.yytext),8;case 77:this.pushState("md_string");break;case 78:this.pushState("md_string");break;case 79:return"NODE_DESCR";case 80:this.popState();break;case 81:s.getLogger().debug("Lex: Starting string"),this.pushState("string");break;case 82:s.getLogger().debug("LEX ARR: Starting string"),this.pushState("string");break;case 83:return s.getLogger().debug("LEX: NODE_DESCR:",i.yytext),"NODE_DESCR";case 84:s.getLogger().debug("LEX POPPING"),this.popState();break;case 85:s.getLogger().debug("Lex: =>BAE"),this.pushState("ARROW_DIR");break;case 86:return i.yytext=i.yytext.replace(/^,\s*/,""),s.getLogger().debug("Lex (right): dir:",i.yytext),"DIR";case 87:return i.yytext=i.yytext.replace(/^,\s*/,""),s.getLogger().debug("Lex (left):",i.yytext),"DIR";case 88:return 
i.yytext=i.yytext.replace(/^,\s*/,""),s.getLogger().debug("Lex (x):",i.yytext),"DIR";case 89:return i.yytext=i.yytext.replace(/^,\s*/,""),s.getLogger().debug("Lex (y):",i.yytext),"DIR";case 90:return i.yytext=i.yytext.replace(/^,\s*/,""),s.getLogger().debug("Lex (up):",i.yytext),"DIR";case 91:return i.yytext=i.yytext.replace(/^,\s*/,""),s.getLogger().debug("Lex (down):",i.yytext),"DIR";case 92:return i.yytext="]>",s.getLogger().debug("Lex (ARROW_DIR end):",i.yytext),this.popState(),this.popState(),"BLOCK_ARROW_END";case 93:return s.getLogger().debug("Lex: LINK","#"+i.yytext+"#"),15;case 94:return s.getLogger().debug("Lex: LINK",i.yytext),15;case 95:return s.getLogger().debug("Lex: LINK",i.yytext),15;case 96:return s.getLogger().debug("Lex: LINK",i.yytext),15;case 97:return s.getLogger().debug("Lex: START_LINK",i.yytext),this.pushState("LLABEL"),16;case 98:return s.getLogger().debug("Lex: START_LINK",i.yytext),this.pushState("LLABEL"),16;case 99:return s.getLogger().debug("Lex: START_LINK",i.yytext),this.pushState("LLABEL"),16;case 100:this.pushState("md_string");break;case 101:return s.getLogger().debug("Lex: Starting string"),this.pushState("string"),"LINK_LABEL";case 102:return this.popState(),s.getLogger().debug("Lex: LINK","#"+i.yytext+"#"),15;case 103:return this.popState(),s.getLogger().debug("Lex: LINK",i.yytext),15;case 104:return this.popState(),s.getLogger().debug("Lex: LINK",i.yytext),15;case 105:return s.getLogger().debug("Lex: 
COLON",i.yytext),i.yytext=i.yytext.slice(1),27}},rules:[/^(?:block-beta\b)/,/^(?:block\s+)/,/^(?:block\n+)/,/^(?:block:)/,/^(?:[\s]+)/,/^(?:[\n]+)/,/^(?:((\u000D\u000A)|(\u000A)))/,/^(?:columns\s+auto\b)/,/^(?:columns\s+[\d]+)/,/^(?:["][`])/,/^(?:[^`"]+)/,/^(?:[`]["])/,/^(?:["])/,/^(?:["])/,/^(?:[^"]*)/,/^(?:space[:]\d+)/,/^(?:space\b)/,/^(?:default\b)/,/^(?:linkStyle\b)/,/^(?:interpolate\b)/,/^(?:classDef\s+)/,/^(?:DEFAULT\s+)/,/^(?:\w+\s+)/,/^(?:[^\n]*)/,/^(?:class\s+)/,/^(?:(\w+)+((,\s*\w+)*))/,/^(?:[^\n]*)/,/^(?:style\s+)/,/^(?:(\w+)+((,\s*\w+)*))/,/^(?:[^\n]*)/,/^(?:accTitle\s*:\s*)/,/^(?:(?!\n||)*[^\n]*)/,/^(?:accDescr\s*:\s*)/,/^(?:(?!\n||)*[^\n]*)/,/^(?:accDescr\s*\{\s*)/,/^(?:[\}])/,/^(?:[^\}]*)/,/^(?:end\b\s*)/,/^(?:\(\(\()/,/^(?:\)\)\))/,/^(?:[\)]\))/,/^(?:\}\})/,/^(?:\})/,/^(?:\(-)/,/^(?:-\))/,/^(?:\(\()/,/^(?:\]\])/,/^(?:\()/,/^(?:\]\))/,/^(?:\\\])/,/^(?:\/\])/,/^(?:\)\])/,/^(?:[\)])/,/^(?:\]>)/,/^(?:[\]])/,/^(?:-\))/,/^(?:\(-)/,/^(?:\)\))/,/^(?:\))/,/^(?:\(\(\()/,/^(?:\(\()/,/^(?:\{\{)/,/^(?:\{)/,/^(?:>)/,/^(?:\(\[)/,/^(?:\()/,/^(?:\[\[)/,/^(?:\[\|)/,/^(?:\[\()/,/^(?:\)\)\))/,/^(?:\[\\)/,/^(?:\[\/)/,/^(?:\[\\)/,/^(?:\[)/,/^(?:<\[)/,/^(?:[^\(\[\n\-\)\{\}\s\<\>:]+)/,/^(?:$)/,/^(?:["][`])/,/^(?:["][`])/,/^(?:[^`"]+)/,/^(?:[`]["])/,/^(?:["])/,/^(?:["])/,/^(?:[^"]+)/,/^(?:["])/,/^(?:\]>\s*\()/,/^(?:,?\s*right\s*)/,/^(?:,?\s*left\s*)/,/^(?:,?\s*x\s*)/,/^(?:,?\s*y\s*)/,/^(?:,?\s*up\s*)/,/^(?:,?\s*down\s*)/,/^(?:\)\s*)/,/^(?:\s*[xo<]?--+[-xo>]\s*)/,/^(?:\s*[xo<]?==+[=xo>]\s*)/,/^(?:\s*[xo<]?-?\.+-[xo>]?\s*)/,/^(?:\s*~~[\~]+\s*)/,/^(?:\s*[xo<]?--\s*)/,/^(?:\s*[xo<]?==\s*)/,/^(?:\s*[xo<]?-\.\s*)/,/^(?:["][`])/,/^(?:["])/,/^(?:\s*[xo<]?--+[-xo>]\s*)/,/^(?:\s*[xo<]?==+[=xo>]\s*)/,/^(?:\s*[xo<]?-?\.+-[xo>]?\s*)/,/^(?::\d+)/],conditions:{STYLE_DEFINITION:{rules:[29],inclusive:!1},STYLE_STMNT:{rules:[28],inclusive:!1},CLASSDEFID:{rules:[23],inclusive:!1},CLASSDEF:{rules:[21,22],inclusive:!1},CLASS_STYLE:{rules:[26],inclusive:!1},CLASS:{rules:[25],inclusive:!1},LLABEL
:{rules:[100,101,102,103,104],inclusive:!1},ARROW_DIR:{rules:[86,87,88,89,90,91,92],inclusive:!1},BLOCK_ARROW:{rules:[77,82,85],inclusive:!1},NODE:{rules:[38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,78,81],inclusive:!1},md_string:{rules:[10,11,79,80],inclusive:!1},space:{rules:[],inclusive:!1},string:{rules:[13,14,83,84],inclusive:!1},acc_descr_multiline:{rules:[35,36],inclusive:!1},acc_descr:{rules:[33],inclusive:!1},acc_title:{rules:[31],inclusive:!1},INITIAL:{rules:[0,1,2,3,4,5,6,7,8,9,12,15,16,17,18,19,20,24,27,30,32,34,37,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,93,94,95,96,97,98,99,105],inclusive:!0}}};return D}();L.lexer=A;function k(){this.yy={}}return k.prototype=L,L.Parser=k,new k}();ee.parser=ee;const Pe=ee;let O={},ie=[],V={};const ce="color",ue="fill",Fe="bgFill",pe=",",Ke=he();let j={};const Me=e=>De.sanitizeText(e,Ke),Ye=function(e,a=""){j[e]===void 0&&(j[e]={id:e,styles:[],textStyles:[]});const d=j[e];a!=null&&a.split(pe).forEach(c=>{const n=c.replace(/([^;]*);/,"$1").trim();if(c.match(ce)){const l=n.replace(ue,Fe).replace(ce,ue);d.textStyles.push(l)}d.styles.push(n)})},We=function(e,a=""){const d=O[e];a!=null&&(d.styles=a.split(pe))},Ve=function(e,a){e.split(",").forEach(function(d){let c=O[d];if(c===void 0){const n=d.trim();O[n]={id:n,type:"na",children:[]},c=O[n]}c.classes||(c.classes=[]),c.classes.push(a)})},fe=(e,a)=>{const d=e.flat(),c=[];for(const n of d){if(n.label&&(n.label=Me(n.label)),n.type==="classDef"){Ye(n.id,n.css);continue}if(n.type==="applyClass"){Ve(n.id,(n==null?void 0:n.styleClass)||"");continue}if(n.type==="applyStyles"){n!=null&&n.stylesStr&&We(n.id,n==null?void 0:n.stylesStr);continue}if(n.type==="column-setting")a.columns=n.columns||-1;else if(n.type==="edge")V[n.id]?V[n.id]++:V[n.id]=1,n.id=V[n.id]+"-"+n.id,ie.push(n);else{n.label||(n.type==="composite"?n.label="":n.label=n.id);const 
g=!O[n.id];if(g?O[n.id]=n:(n.type!=="na"&&(O[n.id].type=n.type),n.label!==n.id&&(O[n.id].label=n.label)),n.children&&fe(n.children,n),n.type==="space"){const l=n.width||1;for(let f=0;f<l;f++){const b=Ne(n);b.id=b.id+"-"+f,O[b.id]=b,c.push(b)}}else g&&c.push(n)}}a.children=c};let re=[],U={id:"root",type:"composite",children:[],columns:-1};const je=()=>{S.debug("Clear called"),Ee(),U={id:"root",type:"composite",children:[],columns:-1},O={root:U},re=[],j={},ie=[],V={}};function Ue(e){switch(S.debug("typeStr2Type",e),e){case"[]":return"square";case"()":return S.debug("we have a round"),"round";case"(())":return"circle";case">]":return"rect_left_inv_arrow";case"{}":return"diamond";case"{{}}":return"hexagon";case"([])":return"stadium";case"[[]]":return"subroutine";case"[()]":return"cylinder";case"((()))":return"doublecircle";case"[//]":return"lean_right";case"[\\\\]":return"lean_left";case"[/\\]":return"trapezoid";case"[\\/]":return"inv_trapezoid";case"<[]>":return"block_arrow";default:return"na"}}function Xe(e){switch(S.debug("typeStr2Type",e),e){case"==":return"thick";default:return"normal"}}function Ge(e){switch(e.trim()){case"--x":return"arrow_cross";case"--o":return"arrow_circle";default:return"arrow_point"}}let de=0;const He=()=>(de++,"id-"+Math.random().toString(36).substr(2,12)+"-"+de),qe=e=>{U.children=e,fe(e,U),re=U.children},Ze=e=>{const a=O[e];return a?a.columns?a.columns:a.children?a.children.length:-1:-1},Je=()=>[...Object.values(O)],Qe=()=>re||[],$e=()=>ie,et=e=>O[e],tt=e=>{O[e.id]=e},st=()=>console,it=function(){return j},rt={getConfig:()=>se().block,typeStr2Type:Ue,edgeTypeStr2Type:Xe,edgeStrToEdgeData:Ge,getLogger:st,getBlocksFlat:Je,getBlocks:Qe,getEdges:$e,setHierarchy:qe,getBlock:et,setBlock:tt,getColumns:Ze,getClasses:it,clear:je,generateId:He},nt=rt,q=(e,a)=>{const d=Re,c=d(e,"r"),n=d(e,"g"),g=d(e,"b");return we(c,n,g,a)},at=e=>`.label {
|
| 7 |
+
font-family: ${e.fontFamily};
|
| 8 |
+
color: ${e.nodeTextColor||e.textColor};
|
| 9 |
+
}
|
| 10 |
+
.cluster-label text {
|
| 11 |
+
fill: ${e.titleColor};
|
| 12 |
+
}
|
| 13 |
+
.cluster-label span,p {
|
| 14 |
+
color: ${e.titleColor};
|
| 15 |
+
}
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
.label text,span,p {
|
| 20 |
+
fill: ${e.nodeTextColor||e.textColor};
|
| 21 |
+
color: ${e.nodeTextColor||e.textColor};
|
| 22 |
+
}
|
| 23 |
+
|
| 24 |
+
.node rect,
|
| 25 |
+
.node circle,
|
| 26 |
+
.node ellipse,
|
| 27 |
+
.node polygon,
|
| 28 |
+
.node path {
|
| 29 |
+
fill: ${e.mainBkg};
|
| 30 |
+
stroke: ${e.nodeBorder};
|
| 31 |
+
stroke-width: 1px;
|
| 32 |
+
}
|
| 33 |
+
.flowchart-label text {
|
| 34 |
+
text-anchor: middle;
|
| 35 |
+
}
|
| 36 |
+
// .flowchart-label .text-outer-tspan {
|
| 37 |
+
// text-anchor: middle;
|
| 38 |
+
// }
|
| 39 |
+
// .flowchart-label .text-inner-tspan {
|
| 40 |
+
// text-anchor: start;
|
| 41 |
+
// }
|
| 42 |
+
|
| 43 |
+
.node .label {
|
| 44 |
+
text-align: center;
|
| 45 |
+
}
|
| 46 |
+
.node.clickable {
|
| 47 |
+
cursor: pointer;
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
.arrowheadPath {
|
| 51 |
+
fill: ${e.arrowheadColor};
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
.edgePath .path {
|
| 55 |
+
stroke: ${e.lineColor};
|
| 56 |
+
stroke-width: 2.0px;
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
.flowchart-link {
|
| 60 |
+
stroke: ${e.lineColor};
|
| 61 |
+
fill: none;
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
.edgeLabel {
|
| 65 |
+
background-color: ${e.edgeLabelBackground};
|
| 66 |
+
rect {
|
| 67 |
+
opacity: 0.5;
|
| 68 |
+
background-color: ${e.edgeLabelBackground};
|
| 69 |
+
fill: ${e.edgeLabelBackground};
|
| 70 |
+
}
|
| 71 |
+
text-align: center;
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
/* For html labels only */
|
| 75 |
+
.labelBkg {
|
| 76 |
+
background-color: ${q(e.edgeLabelBackground,.5)};
|
| 77 |
+
// background-color:
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
.node .cluster {
|
| 81 |
+
// fill: ${q(e.mainBkg,.5)};
|
| 82 |
+
fill: ${q(e.clusterBkg,.5)};
|
| 83 |
+
stroke: ${q(e.clusterBorder,.2)};
|
| 84 |
+
box-shadow: rgba(50, 50, 93, 0.25) 0px 13px 27px -5px, rgba(0, 0, 0, 0.3) 0px 8px 16px -8px;
|
| 85 |
+
stroke-width: 1px;
|
| 86 |
+
}
|
| 87 |
+
|
| 88 |
+
.cluster text {
|
| 89 |
+
fill: ${e.titleColor};
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
.cluster span,p {
|
| 93 |
+
color: ${e.titleColor};
|
| 94 |
+
}
|
| 95 |
+
/* .cluster div {
|
| 96 |
+
color: ${e.titleColor};
|
| 97 |
+
} */
|
| 98 |
+
|
| 99 |
+
div.mermaidTooltip {
|
| 100 |
+
position: absolute;
|
| 101 |
+
text-align: center;
|
| 102 |
+
max-width: 200px;
|
| 103 |
+
padding: 2px;
|
| 104 |
+
font-family: ${e.fontFamily};
|
| 105 |
+
font-size: 12px;
|
| 106 |
+
background: ${e.tertiaryColor};
|
| 107 |
+
border: 1px solid ${e.border2};
|
| 108 |
+
border-radius: 2px;
|
| 109 |
+
pointer-events: none;
|
| 110 |
+
z-index: 100;
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
.flowchartTitleText {
|
| 114 |
+
text-anchor: middle;
|
| 115 |
+
font-size: 18px;
|
| 116 |
+
fill: ${e.textColor};
|
| 117 |
+
}
|
| 118 |
+
`,lt=at;function be(e,a,d=!1){var c,n,g;const l=e;let f="default";(((c=l==null?void 0:l.classes)==null?void 0:c.length)||0)>0&&(f=((l==null?void 0:l.classes)||[]).join(" ")),f=f+" flowchart-label";let b=0,p="",x;switch(l.type){case"round":b=5,p="rect";break;case"composite":b=0,p="composite",x=0;break;case"square":p="rect";break;case"diamond":p="question";break;case"hexagon":p="hexagon";break;case"block_arrow":p="block_arrow";break;case"odd":p="rect_left_inv_arrow";break;case"lean_right":p="lean_right";break;case"lean_left":p="lean_left";break;case"trapezoid":p="trapezoid";break;case"inv_trapezoid":p="inv_trapezoid";break;case"rect_left_inv_arrow":p="rect_left_inv_arrow";break;case"circle":p="circle";break;case"ellipse":p="ellipse";break;case"stadium":p="stadium";break;case"subroutine":p="subroutine";break;case"cylinder":p="cylinder";break;case"group":p="rect";break;case"doublecircle":p="doublecircle";break;default:p="rect"}const y=ve((l==null?void 0:l.styles)||[]),T=l.label,v=l.size||{width:0,height:0,x:0,y:0};return{labelStyle:y.labelStyle,shape:p,labelText:T,rx:b,ry:b,class:f,style:y.style,id:l.id,directions:l.directions,width:v.width,height:v.height,x:v.x,y:v.y,positioned:d,intersect:void 0,type:l.type,padding:x??(((g=(n=se())==null?void 0:n.block)==null?void 0:g.padding)||0)}}async function ot(e,a,d){const c=be(a,d,!1);if(c.type==="group")return;const n=await ge(e,c),g=n.node().getBBox(),l=d.getBlock(c.id);l.size={width:g.width,height:g.height,x:0,y:0,node:n},d.setBlock(l),n.remove()}async function ct(e,a,d){const c=be(a,d,!0);d.getBlock(c.id).type!=="space"&&(await ge(e,c),a.intersect=c==null?void 0:c.intersect,ze(c))}async function ne(e,a,d,c){for(const n of a)await c(e,n,d),n.children&&await ne(e,n.children,d,c)}async function ut(e,a,d){await ne(e,a,d,ot)}async function dt(e,a,d){await ne(e,a,d,ct)}async function ht(e,a,d,c,n){const g=new Ce({multigraph:!0,compound:!0});g.setGraph({rankdir:"TB",nodesep:10,ranksep:10,marginx:8,marginy:8});for(const l of 
d)l.size&&g.setNode(l.id,{width:l.size.width,height:l.size.height,intersect:l.intersect});for(const l of a)if(l.start&&l.end){const f=c.getBlock(l.start),b=c.getBlock(l.end);if(f!=null&&f.size&&(b!=null&&b.size)){const p=f.size,x=b.size,y=[{x:p.x,y:p.y},{x:p.x+(x.x-p.x)/2,y:p.y+(x.y-p.y)/2},{x:x.x,y:x.y}];await Ie(e,{v:l.start,w:l.end,name:l.id},{...l,arrowTypeEnd:l.arrowTypeEnd,arrowTypeStart:l.arrowTypeStart,points:y,classes:"edge-thickness-normal edge-pattern-solid flowchart-link LS-a1 LE-b1"},void 0,"block",g,n),l.label&&(await Oe(e,{...l,label:l.label,labelStyle:"stroke: #333; stroke-width: 1.5px;fill:none;",arrowTypeEnd:l.arrowTypeEnd,arrowTypeStart:l.arrowTypeStart}),await Te({...l,x:y[1].x,y:y[1].y},{originalPath:y}))}}}const _=((oe=(le=he())==null?void 0:le.block)==null?void 0:oe.padding)||8;function gt(e,a){if(e===0||!Number.isInteger(e))throw new Error("Columns must be an integer !== 0.");if(a<0||!Number.isInteger(a))throw new Error("Position must be a non-negative integer."+a);if(e<0)return{px:a,py:0};if(e===1)return{px:0,py:a};const d=a%e,c=Math.floor(a/e);return{px:d,py:c}}const pt=e=>{let a=0,d=0;for(const c of e.children){const{width:n,height:g,x:l,y:f}=c.size||{width:0,height:0,x:0,y:0};S.debug("getMaxChildSize abc95 child:",c.id,"width:",n,"height:",g,"x:",l,"y:",f,c.type),c.type!=="space"&&(n>a&&(a=n/(e.widthInColumns||1)),g>d&&(d=g))}return{width:a,height:d}};function te(e,a,d=0,c=0){var n,g,l,f,b,p,x,y,T,v,N;S.debug("setBlockSizes abc95 (start)",e.id,(n=e==null?void 0:e.size)==null?void 0:n.x,"block width =",e==null?void 0:e.size,"sieblingWidth",d),(g=e==null?void 0:e.size)!=null&&g.width||(e.size={width:d,height:c,x:0,y:0});let E=0,L=0;if(((l=e.children)==null?void 0:l.length)>0){for(const h of e.children)te(h,a);const A=pt(e);E=A.width,L=A.height,S.debug("setBlockSizes abc95 maxWidth of",e.id,":s children is ",E,L);for(const h of e.children)h.size&&(S.debug(`abc95 Setting size of children of ${e.id} id=${h.id} ${E} ${L} 
${h.size}`),h.size.width=E*(h.widthInColumns||1)+_*((h.widthInColumns||1)-1),h.size.height=L,h.size.x=0,h.size.y=0,S.debug(`abc95 updating size of ${e.id} children child:${h.id} maxWidth:${E} maxHeight:${L}`));for(const h of e.children)te(h,a,E,L);const k=e.columns||-1;let D=0;for(const h of e.children)D+=h.widthInColumns||1;let o=e.children.length;k>0&&k<D&&(o=k),e.widthInColumns;const s=Math.ceil(D/o);let i=o*(E+_)+_,u=s*(L+_)+_;if(i<d){S.debug(`Detected to small siebling: abc95 ${e.id} sieblingWidth ${d} sieblingHeight ${c} width ${i}`),i=d,u=c;const h=(d-o*_-_)/o,t=(c-s*_-_)/s;S.debug("Size indata abc88",e.id,"childWidth",h,"maxWidth",E),S.debug("Size indata abc88",e.id,"childHeight",t,"maxHeight",L),S.debug("Size indata abc88 xSize",o,"padding",_);for(const m of e.children)m.size&&(m.size.width=h,m.size.height=t,m.size.x=0,m.size.y=0)}if(S.debug(`abc95 (finale calc) ${e.id} xSize ${o} ySize ${s} columns ${k}${e.children.length} width=${Math.max(i,((f=e.size)==null?void 0:f.width)||0)}`),i<(((b=e==null?void 0:e.size)==null?void 0:b.width)||0)){i=((p=e==null?void 0:e.size)==null?void 0:p.width)||0;const h=k>0?Math.min(e.children.length,k):e.children.length;if(h>0){const t=(i-h*_-_)/h;S.debug("abc95 (growing to fit) width",e.id,i,(x=e.size)==null?void 0:x.width,t);for(const m of e.children)m.size&&(m.size.width=t)}}e.size={width:i,height:u,x:0,y:0}}S.debug("setBlockSizes abc94 (done)",e.id,(y=e==null?void 0:e.size)==null?void 0:y.x,(T=e==null?void 0:e.size)==null?void 0:T.width,(v=e==null?void 0:e.size)==null?void 0:v.y,(N=e==null?void 0:e.size)==null?void 0:N.height)}function xe(e,a){var d,c,n,g,l,f,b,p,x,y,T,v,N,E,L,A,k;S.debug(`abc85 layout blocks (=>layoutBlocks) ${e.id} x: ${(d=e==null?void 0:e.size)==null?void 0:d.x} y: ${(c=e==null?void 0:e.size)==null?void 0:c.y} width: ${(n=e==null?void 0:e.size)==null?void 0:n.width}`);const D=e.columns||-1;if(S.debug("layoutBlocks columns abc95",e.id,"=>",D,e),e.children&&e.children.length>0){const 
o=((l=(g=e==null?void 0:e.children[0])==null?void 0:g.size)==null?void 0:l.width)||0,s=e.children.length*o+(e.children.length-1)*_;S.debug("widthOfChildren 88",s,"posX");let i=0;S.debug("abc91 block?.size?.x",e.id,(f=e==null?void 0:e.size)==null?void 0:f.x);let u=(b=e==null?void 0:e.size)!=null&&b.x?((p=e==null?void 0:e.size)==null?void 0:p.x)+(-((x=e==null?void 0:e.size)==null?void 0:x.width)/2||0):-_,h=0;for(const t of e.children){const m=e;if(!t.size)continue;const{width:r,height:R}=t.size,{px:Y,py:F}=gt(D,i);if(F!=h&&(h=F,u=(y=e==null?void 0:e.size)!=null&&y.x?((T=e==null?void 0:e.size)==null?void 0:T.x)+(-((v=e==null?void 0:e.size)==null?void 0:v.width)/2||0):-_,S.debug("New row in layout for block",e.id," and child ",t.id,h)),S.debug(`abc89 layout blocks (child) id: ${t.id} Pos: ${i} (px, py) ${Y},${F} (${(N=m==null?void 0:m.size)==null?void 0:N.x},${(E=m==null?void 0:m.size)==null?void 0:E.y}) parent: ${m.id} width: ${r}${_}`),m.size){const C=r/2;t.size.x=u+_+C,S.debug(`abc91 layout blocks (calc) px, pyid:${t.id} startingPos=X${u} new startingPosX${t.size.x} ${C} padding=${_} width=${r} halfWidth=${C} => x:${t.size.x} y:${t.size.y} ${t.widthInColumns} (width * (child?.w || 1)) / 2 ${r*((t==null?void 0:t.widthInColumns)||1)/2}`),u=t.size.x+C,t.size.y=m.size.y-m.size.height/2+F*(R+_)+R/2+_,S.debug(`abc88 layout blocks (calc) px, pyid:${t.id}startingPosX${u}${_}${C}=>x:${t.size.x}y:${t.size.y}${t.widthInColumns}(width * (child?.w || 1)) / 2${r*((t==null?void 0:t.widthInColumns)||1)/2}`)}t.children&&xe(t),i+=(t==null?void 0:t.widthInColumns)||1,S.debug("abc88 columnsPos",t,i)}}S.debug(`layout blocks (<==layoutBlocks) ${e.id} x: ${(L=e==null?void 0:e.size)==null?void 0:L.x} y: ${(A=e==null?void 0:e.size)==null?void 0:A.y} width: ${(k=e==null?void 0:e.size)==null?void 0:k.width}`)}function 
Se(e,{minX:a,minY:d,maxX:c,maxY:n}={minX:0,minY:0,maxX:0,maxY:0}){if(e.size&&e.id!=="root"){const{x:g,y:l,width:f,height:b}=e.size;g-f/2<a&&(a=g-f/2),l-b/2<d&&(d=l-b/2),g+f/2>c&&(c=g+f/2),l+b/2>n&&(n=l+b/2)}if(e.children)for(const g of e.children)({minX:a,minY:d,maxX:c,maxY:n}=Se(g,{minX:a,minY:d,maxX:c,maxY:n}));return{minX:a,minY:d,maxX:c,maxY:n}}function ft(e){const a=e.getBlock("root");if(!a)return;te(a,e,0,0),xe(a),S.debug("getBlocks",JSON.stringify(a,null,2));const{minX:d,minY:c,maxX:n,maxY:g}=Se(a),l=g-c,f=n-d;return{x:d,y:c,width:f,height:l}}const bt=function(e,a){return a.db.getClasses()},xt=async function(e,a,d,c){const{securityLevel:n,block:g}=se(),l=c.db;let f;n==="sandbox"&&(f=H("#i"+a));const b=n==="sandbox"?H(f.nodes()[0].contentDocument.body):H("body"),p=n==="sandbox"?b.select(`[id="${a}"]`):H(`[id="${a}"]`);ke(p,["point","circle","cross"],c.type,a);const y=l.getBlocks(),T=l.getBlocksFlat(),v=l.getEdges(),N=p.insert("g").attr("class","block");await ut(N,y,l);const E=ft(l);if(await dt(N,y,l),await ht(N,v,T,l,a),E){const L=E,A=Math.max(1,Math.round(.125*(L.width/L.height))),k=L.height+A+10,D=L.width+10,{useMaxWidth:o}=g;ye(p,k,D,!!o),S.debug("Here Bounds",E,L),p.attr("viewBox",`${L.x-5} ${L.y-5} ${L.width+10} ${L.height+10}`)}Ae(Be)},St={draw:xt,getClasses:bt},Tt={parser:Pe,db:nt,renderer:St,styles:lt};export{Tt as diagram};
|
frontend-dist/assets/c4Diagram-c83219d4-Dwk4T9_E.js
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import{s as we,g as Oe,a as Te,b as Re,c as Dt,d as Nt,l as le,e as De,f as Se,h as wt,i as ue,j as Pe,w as Me,k as Kt,m as oe}from"./index-BCNM9-Ly.js";import{d as Le,g as Ne}from"./svgDrawCommon-b86b1483-KNrWL8cU.js";var Yt=function(){var e=function(bt,_,x,m){for(x=x||{},m=bt.length;m--;x[bt[m]]=_);return x},t=[1,24],a=[1,25],o=[1,26],l=[1,27],i=[1,28],s=[1,63],r=[1,64],n=[1,65],h=[1,66],f=[1,67],d=[1,68],p=[1,69],E=[1,29],O=[1,30],R=[1,31],S=[1,32],L=[1,33],Y=[1,34],Q=[1,35],H=[1,36],q=[1,37],G=[1,38],K=[1,39],J=[1,40],Z=[1,41],$=[1,42],tt=[1,43],et=[1,44],it=[1,45],nt=[1,46],st=[1,47],at=[1,48],rt=[1,50],lt=[1,51],ot=[1,52],ct=[1,53],ht=[1,54],ut=[1,55],dt=[1,56],ft=[1,57],pt=[1,58],yt=[1,59],gt=[1,60],At=[14,42],Vt=[14,34,36,37,38,39,40,41,42,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74],Ot=[12,14,34,36,37,38,39,40,41,42,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74],v=[1,82],k=[1,83],A=[1,84],C=[1,85],w=[12,14,42],ne=[12,14,33,42],Pt=[12,14,33,42,76,77,79,80],mt=[12,33],zt=[34,36,37,38,39,40,41,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74],Xt={trace:function(){},yy:{},symbols_:{error:2,start:3,mermaidDoc:4,direction:5,direction_tb:6,direction_bt:7,direction_rl:8,direction_lr:9,graphConfig:10,C4_CONTEXT:11,NEWLINE:12,statements:13,EOF:14,C4_CONTAINER:15,C4_COMPONENT:16,C4_DYNAMIC:17,C4_DEPLOYMENT:18,otherStatements:19,diagramStatements:20,otherStatement:21,title:22,accDescription:23,acc_title:24,acc_title_value:25,acc_descr:26,acc_descr_value:27,acc_descr_multiline_value:28,boundaryStatement:29,boundaryStartStatement:30,boundaryStopStatement:31,boundaryStart:32,LBRACE:33,ENTERPRISE_BOUNDARY:34,attributes:35,SYSTEM_BOUNDARY:36,BOUNDARY:37,CONTAINER_BOUNDARY:38,NODE:39,NODE_L:40,NODE_R:41,RBRACE:42,diagramStatement:43,PERSON:44,PERSON_EXT:45,SYSTEM:46,SYSTEM_DB:47,SYSTEM_QUEUE:48,SYSTEM_EXT:49,SYSTEM_EXT_DB:50,SYSTEM_EXT_Q
UEUE:51,CONTAINER:52,CONTAINER_DB:53,CONTAINER_QUEUE:54,CONTAINER_EXT:55,CONTAINER_EXT_DB:56,CONTAINER_EXT_QUEUE:57,COMPONENT:58,COMPONENT_DB:59,COMPONENT_QUEUE:60,COMPONENT_EXT:61,COMPONENT_EXT_DB:62,COMPONENT_EXT_QUEUE:63,REL:64,BIREL:65,REL_U:66,REL_D:67,REL_L:68,REL_R:69,REL_B:70,REL_INDEX:71,UPDATE_EL_STYLE:72,UPDATE_REL_STYLE:73,UPDATE_LAYOUT_CONFIG:74,attribute:75,STR:76,STR_KEY:77,STR_VALUE:78,ATTRIBUTE:79,ATTRIBUTE_EMPTY:80,$accept:0,$end:1},terminals_:{2:"error",6:"direction_tb",7:"direction_bt",8:"direction_rl",9:"direction_lr",11:"C4_CONTEXT",12:"NEWLINE",14:"EOF",15:"C4_CONTAINER",16:"C4_COMPONENT",17:"C4_DYNAMIC",18:"C4_DEPLOYMENT",22:"title",23:"accDescription",24:"acc_title",25:"acc_title_value",26:"acc_descr",27:"acc_descr_value",28:"acc_descr_multiline_value",33:"LBRACE",34:"ENTERPRISE_BOUNDARY",36:"SYSTEM_BOUNDARY",37:"BOUNDARY",38:"CONTAINER_BOUNDARY",39:"NODE",40:"NODE_L",41:"NODE_R",42:"RBRACE",44:"PERSON",45:"PERSON_EXT",46:"SYSTEM",47:"SYSTEM_DB",48:"SYSTEM_QUEUE",49:"SYSTEM_EXT",50:"SYSTEM_EXT_DB",51:"SYSTEM_EXT_QUEUE",52:"CONTAINER",53:"CONTAINER_DB",54:"CONTAINER_QUEUE",55:"CONTAINER_EXT",56:"CONTAINER_EXT_DB",57:"CONTAINER_EXT_QUEUE",58:"COMPONENT",59:"COMPONENT_DB",60:"COMPONENT_QUEUE",61:"COMPONENT_EXT",62:"COMPONENT_EXT_DB",63:"COMPONENT_EXT_QUEUE",64:"REL",65:"BIREL",66:"REL_U",67:"REL_D",68:"REL_L",69:"REL_R",70:"REL_B",71:"REL_INDEX",72:"UPDATE_EL_STYLE",73:"UPDATE_REL_STYLE",74:"UPDATE_LAYOUT_CONFIG",76:"STR",77:"STR_KEY",78:"STR_VALUE",79:"ATTRIBUTE",80:"ATTRIBUTE_EMPTY"},productions_:[0,[3,1],[3,1],[5,1],[5,1],[5,1],[5,1],[4,1],[10,4],[10,4],[10,4],[10,4],[10,4],[13,1],[13,1],[13,2],[19,1],[19,2],[19,3],[21,1],[21,1],[21,2],[21,2],[21,1],[29,3],[30,3],[30,3],[30,4],[32,2],[32,2],[32,2],[32,2],[32,2],[32,2],[32,2],[31,1],[20,1],[20,2],[20,3],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[43,1],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[43
,2],[43,2],[43,2],[43,2],[43,2],[35,1],[35,2],[75,1],[75,2],[75,1],[75,1]],performAction:function(_,x,m,g,T,u,Tt){var y=u.length-1;switch(T){case 3:g.setDirection("TB");break;case 4:g.setDirection("BT");break;case 5:g.setDirection("RL");break;case 6:g.setDirection("LR");break;case 8:case 9:case 10:case 11:case 12:g.setC4Type(u[y-3]);break;case 19:g.setTitle(u[y].substring(6)),this.$=u[y].substring(6);break;case 20:g.setAccDescription(u[y].substring(15)),this.$=u[y].substring(15);break;case 21:this.$=u[y].trim(),g.setTitle(this.$);break;case 22:case 23:this.$=u[y].trim(),g.setAccDescription(this.$);break;case 28:case 29:u[y].splice(2,0,"ENTERPRISE"),g.addPersonOrSystemBoundary(...u[y]),this.$=u[y];break;case 30:g.addPersonOrSystemBoundary(...u[y]),this.$=u[y];break;case 31:u[y].splice(2,0,"CONTAINER"),g.addContainerBoundary(...u[y]),this.$=u[y];break;case 32:g.addDeploymentNode("node",...u[y]),this.$=u[y];break;case 33:g.addDeploymentNode("nodeL",...u[y]),this.$=u[y];break;case 34:g.addDeploymentNode("nodeR",...u[y]),this.$=u[y];break;case 35:g.popBoundaryParseStack();break;case 39:g.addPersonOrSystem("person",...u[y]),this.$=u[y];break;case 40:g.addPersonOrSystem("external_person",...u[y]),this.$=u[y];break;case 41:g.addPersonOrSystem("system",...u[y]),this.$=u[y];break;case 42:g.addPersonOrSystem("system_db",...u[y]),this.$=u[y];break;case 43:g.addPersonOrSystem("system_queue",...u[y]),this.$=u[y];break;case 44:g.addPersonOrSystem("external_system",...u[y]),this.$=u[y];break;case 45:g.addPersonOrSystem("external_system_db",...u[y]),this.$=u[y];break;case 46:g.addPersonOrSystem("external_system_queue",...u[y]),this.$=u[y];break;case 47:g.addContainer("container",...u[y]),this.$=u[y];break;case 48:g.addContainer("container_db",...u[y]),this.$=u[y];break;case 49:g.addContainer("container_queue",...u[y]),this.$=u[y];break;case 50:g.addContainer("external_container",...u[y]),this.$=u[y];break;case 
51:g.addContainer("external_container_db",...u[y]),this.$=u[y];break;case 52:g.addContainer("external_container_queue",...u[y]),this.$=u[y];break;case 53:g.addComponent("component",...u[y]),this.$=u[y];break;case 54:g.addComponent("component_db",...u[y]),this.$=u[y];break;case 55:g.addComponent("component_queue",...u[y]),this.$=u[y];break;case 56:g.addComponent("external_component",...u[y]),this.$=u[y];break;case 57:g.addComponent("external_component_db",...u[y]),this.$=u[y];break;case 58:g.addComponent("external_component_queue",...u[y]),this.$=u[y];break;case 60:g.addRel("rel",...u[y]),this.$=u[y];break;case 61:g.addRel("birel",...u[y]),this.$=u[y];break;case 62:g.addRel("rel_u",...u[y]),this.$=u[y];break;case 63:g.addRel("rel_d",...u[y]),this.$=u[y];break;case 64:g.addRel("rel_l",...u[y]),this.$=u[y];break;case 65:g.addRel("rel_r",...u[y]),this.$=u[y];break;case 66:g.addRel("rel_b",...u[y]),this.$=u[y];break;case 67:u[y].splice(0,1),g.addRel("rel",...u[y]),this.$=u[y];break;case 68:g.updateElStyle("update_el_style",...u[y]),this.$=u[y];break;case 69:g.updateRelStyle("update_rel_style",...u[y]),this.$=u[y];break;case 70:g.updateLayoutConfig("update_layout_config",...u[y]),this.$=u[y];break;case 71:this.$=[u[y]];break;case 72:u[y].unshift(u[y-1]),this.$=u[y];break;case 73:case 75:this.$=u[y].trim();break;case 74:let Et={};Et[u[y-1].trim()]=u[y].trim(),this.$=Et;break;case 
76:this.$="";break}},table:[{3:1,4:2,5:3,6:[1,5],7:[1,6],8:[1,7],9:[1,8],10:4,11:[1,9],15:[1,10],16:[1,11],17:[1,12],18:[1,13]},{1:[3]},{1:[2,1]},{1:[2,2]},{1:[2,7]},{1:[2,3]},{1:[2,4]},{1:[2,5]},{1:[2,6]},{12:[1,14]},{12:[1,15]},{12:[1,16]},{12:[1,17]},{12:[1,18]},{13:19,19:20,20:21,21:22,22:t,23:a,24:o,26:l,28:i,29:49,30:61,32:62,34:s,36:r,37:n,38:h,39:f,40:d,41:p,43:23,44:E,45:O,46:R,47:S,48:L,49:Y,50:Q,51:H,52:q,53:G,54:K,55:J,56:Z,57:$,58:tt,59:et,60:it,61:nt,62:st,63:at,64:rt,65:lt,66:ot,67:ct,68:ht,69:ut,70:dt,71:ft,72:pt,73:yt,74:gt},{13:70,19:20,20:21,21:22,22:t,23:a,24:o,26:l,28:i,29:49,30:61,32:62,34:s,36:r,37:n,38:h,39:f,40:d,41:p,43:23,44:E,45:O,46:R,47:S,48:L,49:Y,50:Q,51:H,52:q,53:G,54:K,55:J,56:Z,57:$,58:tt,59:et,60:it,61:nt,62:st,63:at,64:rt,65:lt,66:ot,67:ct,68:ht,69:ut,70:dt,71:ft,72:pt,73:yt,74:gt},{13:71,19:20,20:21,21:22,22:t,23:a,24:o,26:l,28:i,29:49,30:61,32:62,34:s,36:r,37:n,38:h,39:f,40:d,41:p,43:23,44:E,45:O,46:R,47:S,48:L,49:Y,50:Q,51:H,52:q,53:G,54:K,55:J,56:Z,57:$,58:tt,59:et,60:it,61:nt,62:st,63:at,64:rt,65:lt,66:ot,67:ct,68:ht,69:ut,70:dt,71:ft,72:pt,73:yt,74:gt},{13:72,19:20,20:21,21:22,22:t,23:a,24:o,26:l,28:i,29:49,30:61,32:62,34:s,36:r,37:n,38:h,39:f,40:d,41:p,43:23,44:E,45:O,46:R,47:S,48:L,49:Y,50:Q,51:H,52:q,53:G,54:K,55:J,56:Z,57:$,58:tt,59:et,60:it,61:nt,62:st,63:at,64:rt,65:lt,66:ot,67:ct,68:ht,69:ut,70:dt,71:ft,72:pt,73:yt,74:gt},{13:73,19:20,20:21,21:22,22:t,23:a,24:o,26:l,28:i,29:49,30:61,32:62,34:s,36:r,37:n,38:h,39:f,40:d,41:p,43:23,44:E,45:O,46:R,47:S,48:L,49:Y,50:Q,51:H,52:q,53:G,54:K,55:J,56:Z,57:$,58:tt,59:et,60:it,61:nt,62:st,63:at,64:rt,65:lt,66:ot,67:ct,68:ht,69:ut,70:dt,71:ft,72:pt,73:yt,74:gt},{14:[1,74]},e(At,[2,13],{43:23,29:49,30:61,32:62,20:75,34:s,36:r,37:n,38:h,39:f,40:d,41:p,44:E,45:O,46:R,47:S,48:L,49:Y,50:Q,51:H,52:q,53:G,54:K,55:J,56:Z,57:$,58:tt,59:et,60:it,61:nt,62:st,63:at,64:rt,65:lt,66:ot,67:ct,68:ht,69:ut,70:dt,71:ft,72:pt,73:yt,74:gt}),e(At,[2,14]),e(Vt,[2,16],{12:[1,76]}),e(At,[2,36],{12:[1,77]
}),e(Ot,[2,19]),e(Ot,[2,20]),{25:[1,78]},{27:[1,79]},e(Ot,[2,23]),{35:80,75:81,76:v,77:k,79:A,80:C},{35:86,75:81,76:v,77:k,79:A,80:C},{35:87,75:81,76:v,77:k,79:A,80:C},{35:88,75:81,76:v,77:k,79:A,80:C},{35:89,75:81,76:v,77:k,79:A,80:C},{35:90,75:81,76:v,77:k,79:A,80:C},{35:91,75:81,76:v,77:k,79:A,80:C},{35:92,75:81,76:v,77:k,79:A,80:C},{35:93,75:81,76:v,77:k,79:A,80:C},{35:94,75:81,76:v,77:k,79:A,80:C},{35:95,75:81,76:v,77:k,79:A,80:C},{35:96,75:81,76:v,77:k,79:A,80:C},{35:97,75:81,76:v,77:k,79:A,80:C},{35:98,75:81,76:v,77:k,79:A,80:C},{35:99,75:81,76:v,77:k,79:A,80:C},{35:100,75:81,76:v,77:k,79:A,80:C},{35:101,75:81,76:v,77:k,79:A,80:C},{35:102,75:81,76:v,77:k,79:A,80:C},{35:103,75:81,76:v,77:k,79:A,80:C},{35:104,75:81,76:v,77:k,79:A,80:C},e(w,[2,59]),{35:105,75:81,76:v,77:k,79:A,80:C},{35:106,75:81,76:v,77:k,79:A,80:C},{35:107,75:81,76:v,77:k,79:A,80:C},{35:108,75:81,76:v,77:k,79:A,80:C},{35:109,75:81,76:v,77:k,79:A,80:C},{35:110,75:81,76:v,77:k,79:A,80:C},{35:111,75:81,76:v,77:k,79:A,80:C},{35:112,75:81,76:v,77:k,79:A,80:C},{35:113,75:81,76:v,77:k,79:A,80:C},{35:114,75:81,76:v,77:k,79:A,80:C},{35:115,75:81,76:v,77:k,79:A,80:C},{20:116,29:49,30:61,32:62,34:s,36:r,37:n,38:h,39:f,40:d,41:p,43:23,44:E,45:O,46:R,47:S,48:L,49:Y,50:Q,51:H,52:q,53:G,54:K,55:J,56:Z,57:$,58:tt,59:et,60:it,61:nt,62:st,63:at,64:rt,65:lt,66:ot,67:ct,68:ht,69:ut,70:dt,71:ft,72:pt,73:yt,74:gt},{12:[1,118],33:[1,117]},{35:119,75:81,76:v,77:k,79:A,80:C},{35:120,75:81,76:v,77:k,79:A,80:C},{35:121,75:81,76:v,77:k,79:A,80:C},{35:122,75:81,76:v,77:k,79:A,80:C},{35:123,75:81,76:v,77:k,79:A,80:C},{35:124,75:81,76:v,77:k,79:A,80:C},{35:125,75:81,76:v,77:k,79:A,80:C},{14:[1,126]},{14:[1,127]},{14:[1,128]},{14:[1,129]},{1:[2,8]},e(At,[2,15]),e(Vt,[2,17],{21:22,19:130,22:t,23:a,24:o,26:l,28:i}),e(At,[2,37],{19:20,20:21,21:22,43:23,29:49,30:61,32:62,13:131,22:t,23:a,24:o,26:l,28:i,34:s,36:r,37:n,38:h,39:f,40:d,41:p,44:E,45:O,46:R,47:S,48:L,49:Y,50:Q,51:H,52:q,53:G,54:K,55:J,56:Z,57:$,58:tt,59:et,60:it,61:nt
,62:st,63:at,64:rt,65:lt,66:ot,67:ct,68:ht,69:ut,70:dt,71:ft,72:pt,73:yt,74:gt}),e(Ot,[2,21]),e(Ot,[2,22]),e(w,[2,39]),e(ne,[2,71],{75:81,35:132,76:v,77:k,79:A,80:C}),e(Pt,[2,73]),{78:[1,133]},e(Pt,[2,75]),e(Pt,[2,76]),e(w,[2,40]),e(w,[2,41]),e(w,[2,42]),e(w,[2,43]),e(w,[2,44]),e(w,[2,45]),e(w,[2,46]),e(w,[2,47]),e(w,[2,48]),e(w,[2,49]),e(w,[2,50]),e(w,[2,51]),e(w,[2,52]),e(w,[2,53]),e(w,[2,54]),e(w,[2,55]),e(w,[2,56]),e(w,[2,57]),e(w,[2,58]),e(w,[2,60]),e(w,[2,61]),e(w,[2,62]),e(w,[2,63]),e(w,[2,64]),e(w,[2,65]),e(w,[2,66]),e(w,[2,67]),e(w,[2,68]),e(w,[2,69]),e(w,[2,70]),{31:134,42:[1,135]},{12:[1,136]},{33:[1,137]},e(mt,[2,28]),e(mt,[2,29]),e(mt,[2,30]),e(mt,[2,31]),e(mt,[2,32]),e(mt,[2,33]),e(mt,[2,34]),{1:[2,9]},{1:[2,10]},{1:[2,11]},{1:[2,12]},e(Vt,[2,18]),e(At,[2,38]),e(ne,[2,72]),e(Pt,[2,74]),e(w,[2,24]),e(w,[2,35]),e(zt,[2,25]),e(zt,[2,26],{12:[1,138]}),e(zt,[2,27])],defaultActions:{2:[2,1],3:[2,2],4:[2,7],5:[2,3],6:[2,4],7:[2,5],8:[2,6],74:[2,8],126:[2,9],127:[2,10],128:[2,11],129:[2,12]},parseError:function(_,x){if(x.recoverable)this.trace(_);else{var m=new Error(_);throw m.hash=x,m}},parse:function(_){var x=this,m=[0],g=[],T=[null],u=[],Tt=this.table,y="",Et=0,se=0,ve=2,ae=1,ke=u.slice.call(arguments,1),D=Object.create(this.lexer),vt={yy:{}};for(var Qt in this.yy)Object.prototype.hasOwnProperty.call(this.yy,Qt)&&(vt.yy[Qt]=this.yy[Qt]);D.setInput(_,vt.yy),vt.yy.lexer=D,vt.yy.parser=this,typeof D.yylloc>"u"&&(D.yylloc={});var Ht=D.yylloc;u.push(Ht);var Ae=D.options&&D.options.ranges;typeof vt.yy.parseError=="function"?this.parseError=vt.yy.parseError:this.parseError=Object.getPrototypeOf(this).parseError;function Ce(){var X;return X=g.pop()||D.lex()||ae,typeof X!="number"&&(X instanceof Array&&(g=X,X=g.pop()),X=x.symbols_[X]||X),X}for(var M,kt,N,qt,Ct={},Mt,z,re,Lt;;){if(kt=m[m.length-1],this.defaultActions[kt]?N=this.defaultActions[kt]:((M===null||typeof M>"u")&&(M=Ce()),N=Tt[kt]&&Tt[kt][M]),typeof N>"u"||!N.length||!N[0]){var Gt="";Lt=[];for(Mt in 
Tt[kt])this.terminals_[Mt]&&Mt>ve&&Lt.push("'"+this.terminals_[Mt]+"'");D.showPosition?Gt="Parse error on line "+(Et+1)+`:
|
| 2 |
+
`+D.showPosition()+`
|
| 3 |
+
Expecting `+Lt.join(", ")+", got '"+(this.terminals_[M]||M)+"'":Gt="Parse error on line "+(Et+1)+": Unexpected "+(M==ae?"end of input":"'"+(this.terminals_[M]||M)+"'"),this.parseError(Gt,{text:D.match,token:this.terminals_[M]||M,line:D.yylineno,loc:Ht,expected:Lt})}if(N[0]instanceof Array&&N.length>1)throw new Error("Parse Error: multiple actions possible at state: "+kt+", token: "+M);switch(N[0]){case 1:m.push(M),T.push(D.yytext),u.push(D.yylloc),m.push(N[1]),M=null,se=D.yyleng,y=D.yytext,Et=D.yylineno,Ht=D.yylloc;break;case 2:if(z=this.productions_[N[1]][1],Ct.$=T[T.length-z],Ct._$={first_line:u[u.length-(z||1)].first_line,last_line:u[u.length-1].last_line,first_column:u[u.length-(z||1)].first_column,last_column:u[u.length-1].last_column},Ae&&(Ct._$.range=[u[u.length-(z||1)].range[0],u[u.length-1].range[1]]),qt=this.performAction.apply(Ct,[y,se,Et,vt.yy,N[1],T,u].concat(ke)),typeof qt<"u")return qt;z&&(m=m.slice(0,-1*z*2),T=T.slice(0,-1*z),u=u.slice(0,-1*z)),m.push(this.productions_[N[1]][0]),T.push(Ct.$),u.push(Ct._$),re=Tt[m[m.length-2]][m[m.length-1]],m.push(re);break;case 3:return!0}}return!0}},Ee=function(){var bt={EOF:1,parseError:function(x,m){if(this.yy.parser)this.yy.parser.parseError(x,m);else throw new Error(x)},setInput:function(_,x){return this.yy=x||this.yy||{},this._input=_,this._more=this._backtrack=this.done=!1,this.yylineno=this.yyleng=0,this.yytext=this.matched=this.match="",this.conditionStack=["INITIAL"],this.yylloc={first_line:1,first_column:0,last_line:1,last_column:0},this.options.ranges&&(this.yylloc.range=[0,0]),this.offset=0,this},input:function(){var _=this._input[0];this.yytext+=_,this.yyleng++,this.offset++,this.match+=_,this.matched+=_;var x=_.match(/(?:\r\n?|\n).*/g);return x?(this.yylineno++,this.yylloc.last_line++):this.yylloc.last_column++,this.options.ranges&&this.yylloc.range[1]++,this._input=this._input.slice(1),_},unput:function(_){var 
x=_.length,m=_.split(/(?:\r\n?|\n)/g);this._input=_+this._input,this.yytext=this.yytext.substr(0,this.yytext.length-x),this.offset-=x;var g=this.match.split(/(?:\r\n?|\n)/g);this.match=this.match.substr(0,this.match.length-1),this.matched=this.matched.substr(0,this.matched.length-1),m.length-1&&(this.yylineno-=m.length-1);var T=this.yylloc.range;return this.yylloc={first_line:this.yylloc.first_line,last_line:this.yylineno+1,first_column:this.yylloc.first_column,last_column:m?(m.length===g.length?this.yylloc.first_column:0)+g[g.length-m.length].length-m[0].length:this.yylloc.first_column-x},this.options.ranges&&(this.yylloc.range=[T[0],T[0]+this.yyleng-x]),this.yyleng=this.yytext.length,this},more:function(){return this._more=!0,this},reject:function(){if(this.options.backtrack_lexer)this._backtrack=!0;else return this.parseError("Lexical error on line "+(this.yylineno+1)+`. You can only invoke reject() in the lexer when the lexer is of the backtracking persuasion (options.backtrack_lexer = true).
|
| 4 |
+
`+this.showPosition(),{text:"",token:null,line:this.yylineno});return this},less:function(_){this.unput(this.match.slice(_))},pastInput:function(){var _=this.matched.substr(0,this.matched.length-this.match.length);return(_.length>20?"...":"")+_.substr(-20).replace(/\n/g,"")},upcomingInput:function(){var _=this.match;return _.length<20&&(_+=this._input.substr(0,20-_.length)),(_.substr(0,20)+(_.length>20?"...":"")).replace(/\n/g,"")},showPosition:function(){var _=this.pastInput(),x=new Array(_.length+1).join("-");return _+this.upcomingInput()+`
|
| 5 |
+
`+x+"^"},test_match:function(_,x){var m,g,T;if(this.options.backtrack_lexer&&(T={yylineno:this.yylineno,yylloc:{first_line:this.yylloc.first_line,last_line:this.last_line,first_column:this.yylloc.first_column,last_column:this.yylloc.last_column},yytext:this.yytext,match:this.match,matches:this.matches,matched:this.matched,yyleng:this.yyleng,offset:this.offset,_more:this._more,_input:this._input,yy:this.yy,conditionStack:this.conditionStack.slice(0),done:this.done},this.options.ranges&&(T.yylloc.range=this.yylloc.range.slice(0))),g=_[0].match(/(?:\r\n?|\n).*/g),g&&(this.yylineno+=g.length),this.yylloc={first_line:this.yylloc.last_line,last_line:this.yylineno+1,first_column:this.yylloc.last_column,last_column:g?g[g.length-1].length-g[g.length-1].match(/\r?\n?/)[0].length:this.yylloc.last_column+_[0].length},this.yytext+=_[0],this.match+=_[0],this.matches=_,this.yyleng=this.yytext.length,this.options.ranges&&(this.yylloc.range=[this.offset,this.offset+=this.yyleng]),this._more=!1,this._backtrack=!1,this._input=this._input.slice(_[0].length),this.matched+=_[0],m=this.performAction.call(this,this.yy,this,x,this.conditionStack[this.conditionStack.length-1]),this.done&&this._input&&(this.done=!1),m)return m;if(this._backtrack){for(var u in T)this[u]=T[u];return!1}return!1},next:function(){if(this.done)return this.EOF;this._input||(this.done=!0);var _,x,m,g;this._more||(this.yytext="",this.match="");for(var T=this._currentRules(),u=0;u<T.length;u++)if(m=this._input.match(this.rules[T[u]]),m&&(!x||m[0].length>x[0].length)){if(x=m,g=u,this.options.backtrack_lexer){if(_=this.test_match(m,T[u]),_!==!1)return _;if(this._backtrack){x=!1;continue}else return!1}else if(!this.options.flex)break}return x?(_=this.test_match(x,T[g]),_!==!1?_:!1):this._input===""?this.EOF:this.parseError("Lexical error on line "+(this.yylineno+1)+`. Unrecognized text.
|
| 6 |
+
`+this.showPosition(),{text:"",token:null,line:this.yylineno})},lex:function(){var x=this.next();return x||this.lex()},begin:function(x){this.conditionStack.push(x)},popState:function(){var x=this.conditionStack.length-1;return x>0?this.conditionStack.pop():this.conditionStack[0]},_currentRules:function(){return this.conditionStack.length&&this.conditionStack[this.conditionStack.length-1]?this.conditions[this.conditionStack[this.conditionStack.length-1]].rules:this.conditions.INITIAL.rules},topState:function(x){return x=this.conditionStack.length-1-Math.abs(x||0),x>=0?this.conditionStack[x]:"INITIAL"},pushState:function(x){this.begin(x)},stateStackSize:function(){return this.conditionStack.length},options:{},performAction:function(x,m,g,T){switch(g){case 0:return 6;case 1:return 7;case 2:return 8;case 3:return 9;case 4:return 22;case 5:return 23;case 6:return this.begin("acc_title"),24;case 7:return this.popState(),"acc_title_value";case 8:return this.begin("acc_descr"),26;case 9:return this.popState(),"acc_descr_value";case 10:this.begin("acc_descr_multiline");break;case 11:this.popState();break;case 12:return"acc_descr_multiline_value";case 13:break;case 14:c;break;case 15:return 12;case 16:break;case 17:return 11;case 18:return 15;case 19:return 16;case 20:return 17;case 21:return 18;case 22:return this.begin("person_ext"),45;case 23:return this.begin("person"),44;case 24:return this.begin("system_ext_queue"),51;case 25:return this.begin("system_ext_db"),50;case 26:return this.begin("system_ext"),49;case 27:return this.begin("system_queue"),48;case 28:return this.begin("system_db"),47;case 29:return this.begin("system"),46;case 30:return this.begin("boundary"),37;case 31:return this.begin("enterprise_boundary"),34;case 32:return this.begin("system_boundary"),36;case 33:return this.begin("container_ext_queue"),57;case 34:return this.begin("container_ext_db"),56;case 35:return this.begin("container_ext"),55;case 36:return this.begin("container_queue"),54;case 
37:return this.begin("container_db"),53;case 38:return this.begin("container"),52;case 39:return this.begin("container_boundary"),38;case 40:return this.begin("component_ext_queue"),63;case 41:return this.begin("component_ext_db"),62;case 42:return this.begin("component_ext"),61;case 43:return this.begin("component_queue"),60;case 44:return this.begin("component_db"),59;case 45:return this.begin("component"),58;case 46:return this.begin("node"),39;case 47:return this.begin("node"),39;case 48:return this.begin("node_l"),40;case 49:return this.begin("node_r"),41;case 50:return this.begin("rel"),64;case 51:return this.begin("birel"),65;case 52:return this.begin("rel_u"),66;case 53:return this.begin("rel_u"),66;case 54:return this.begin("rel_d"),67;case 55:return this.begin("rel_d"),67;case 56:return this.begin("rel_l"),68;case 57:return this.begin("rel_l"),68;case 58:return this.begin("rel_r"),69;case 59:return this.begin("rel_r"),69;case 60:return this.begin("rel_b"),70;case 61:return this.begin("rel_index"),71;case 62:return this.begin("update_el_style"),72;case 63:return this.begin("update_rel_style"),73;case 64:return this.begin("update_layout_config"),74;case 65:return"EOF_IN_STRUCT";case 66:return this.begin("attribute"),"ATTRIBUTE_EMPTY";case 67:this.begin("attribute");break;case 68:this.popState(),this.popState();break;case 69:return 80;case 70:break;case 71:return 80;case 72:this.begin("string");break;case 73:this.popState();break;case 74:return"STR";case 75:this.begin("string_kv");break;case 76:return this.begin("string_kv_key"),"STR_KEY";case 77:this.popState(),this.begin("string_kv_value");break;case 78:return"STR_VALUE";case 79:this.popState(),this.popState();break;case 80:return"STR";case 81:return"LBRACE";case 82:return"RBRACE";case 83:return"SPACE";case 84:return"EOL";case 85:return 
14}},rules:[/^(?:.*direction\s+TB[^\n]*)/,/^(?:.*direction\s+BT[^\n]*)/,/^(?:.*direction\s+RL[^\n]*)/,/^(?:.*direction\s+LR[^\n]*)/,/^(?:title\s[^#\n;]+)/,/^(?:accDescription\s[^#\n;]+)/,/^(?:accTitle\s*:\s*)/,/^(?:(?!\n||)*[^\n]*)/,/^(?:accDescr\s*:\s*)/,/^(?:(?!\n||)*[^\n]*)/,/^(?:accDescr\s*\{\s*)/,/^(?:[\}])/,/^(?:[^\}]*)/,/^(?:%%(?!\{)*[^\n]*(\r?\n?)+)/,/^(?:%%[^\n]*(\r?\n)*)/,/^(?:\s*(\r?\n)+)/,/^(?:\s+)/,/^(?:C4Context\b)/,/^(?:C4Container\b)/,/^(?:C4Component\b)/,/^(?:C4Dynamic\b)/,/^(?:C4Deployment\b)/,/^(?:Person_Ext\b)/,/^(?:Person\b)/,/^(?:SystemQueue_Ext\b)/,/^(?:SystemDb_Ext\b)/,/^(?:System_Ext\b)/,/^(?:SystemQueue\b)/,/^(?:SystemDb\b)/,/^(?:System\b)/,/^(?:Boundary\b)/,/^(?:Enterprise_Boundary\b)/,/^(?:System_Boundary\b)/,/^(?:ContainerQueue_Ext\b)/,/^(?:ContainerDb_Ext\b)/,/^(?:Container_Ext\b)/,/^(?:ContainerQueue\b)/,/^(?:ContainerDb\b)/,/^(?:Container\b)/,/^(?:Container_Boundary\b)/,/^(?:ComponentQueue_Ext\b)/,/^(?:ComponentDb_Ext\b)/,/^(?:Component_Ext\b)/,/^(?:ComponentQueue\b)/,/^(?:ComponentDb\b)/,/^(?:Component\b)/,/^(?:Deployment_Node\b)/,/^(?:Node\b)/,/^(?:Node_L\b)/,/^(?:Node_R\b)/,/^(?:Rel\b)/,/^(?:BiRel\b)/,/^(?:Rel_Up\b)/,/^(?:Rel_U\b)/,/^(?:Rel_Down\b)/,/^(?:Rel_D\b)/,/^(?:Rel_Left\b)/,/^(?:Rel_L\b)/,/^(?:Rel_Right\b)/,/^(?:Rel_R\b)/,/^(?:Rel_Back\b)/,/^(?:RelIndex\b)/,/^(?:UpdateElementStyle\b)/,/^(?:UpdateRelStyle\b)/,/^(?:UpdateLayoutConfig\b)/,/^(?:$)/,/^(?:[(][ ]*[,])/,/^(?:[(])/,/^(?:[)])/,/^(?:,,)/,/^(?:,)/,/^(?:[ ]*["]["])/,/^(?:[ ]*["])/,/^(?:["])/,/^(?:[^"]*)/,/^(?:[ ]*[\$])/,/^(?:[^=]*)/,/^(?:[=][ 
]*["])/,/^(?:[^"]+)/,/^(?:["])/,/^(?:[^,]+)/,/^(?:\{)/,/^(?:\})/,/^(?:[\s]+)/,/^(?:[\n\r]+)/,/^(?:$)/],conditions:{acc_descr_multiline:{rules:[11,12],inclusive:!1},acc_descr:{rules:[9],inclusive:!1},acc_title:{rules:[7],inclusive:!1},string_kv_value:{rules:[78,79],inclusive:!1},string_kv_key:{rules:[77],inclusive:!1},string_kv:{rules:[76],inclusive:!1},string:{rules:[73,74],inclusive:!1},attribute:{rules:[68,69,70,71,72,75,80],inclusive:!1},update_layout_config:{rules:[65,66,67,68],inclusive:!1},update_rel_style:{rules:[65,66,67,68],inclusive:!1},update_el_style:{rules:[65,66,67,68],inclusive:!1},rel_b:{rules:[65,66,67,68],inclusive:!1},rel_r:{rules:[65,66,67,68],inclusive:!1},rel_l:{rules:[65,66,67,68],inclusive:!1},rel_d:{rules:[65,66,67,68],inclusive:!1},rel_u:{rules:[65,66,67,68],inclusive:!1},rel_bi:{rules:[],inclusive:!1},rel:{rules:[65,66,67,68],inclusive:!1},node_r:{rules:[65,66,67,68],inclusive:!1},node_l:{rules:[65,66,67,68],inclusive:!1},node:{rules:[65,66,67,68],inclusive:!1},index:{rules:[],inclusive:!1},rel_index:{rules:[65,66,67,68],inclusive:!1},component_ext_queue:{rules:[],inclusive:!1},component_ext_db:{rules:[65,66,67,68],inclusive:!1},component_ext:{rules:[65,66,67,68],inclusive:!1},component_queue:{rules:[65,66,67,68],inclusive:!1},component_db:{rules:[65,66,67,68],inclusive:!1},component:{rules:[65,66,67,68],inclusive:!1},container_boundary:{rules:[65,66,67,68],inclusive:!1},container_ext_queue:{rules:[65,66,67,68],inclusive:!1},container_ext_db:{rules:[65,66,67,68],inclusive:!1},container_ext:{rules:[65,66,67,68],inclusive:!1},container_queue:{rules:[65,66,67,68],inclusive:!1},container_db:{rules:[65,66,67,68],inclusive:!1},container:{rules:[65,66,67,68],inclusive:!1},birel:{rules:[65,66,67,68],inclusive:!1},system_boundary:{rules:[65,66,67,68],inclusive:!1},enterprise_boundary:{rules:[65,66,67,68],inclusive:!1},boundary:{rules:[65,66,67,68],inclusive:!1},system_ext_queue:{rules:[65,66,67,68],inclusive:!1},system_ext_db:{rules:[65,66,67,68],i
nclusive:!1},system_ext:{rules:[65,66,67,68],inclusive:!1},system_queue:{rules:[65,66,67,68],inclusive:!1},system_db:{rules:[65,66,67,68],inclusive:!1},system:{rules:[65,66,67,68],inclusive:!1},person_ext:{rules:[65,66,67,68],inclusive:!1},person:{rules:[65,66,67,68],inclusive:!1},INITIAL:{rules:[0,1,2,3,4,5,6,8,10,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,81,82,83,84,85],inclusive:!0}}};return bt}();Xt.lexer=Ee;function Wt(){this.yy={}}return Wt.prototype=Xt,Xt.Parser=Wt,new Wt}();Yt.parser=Yt;const Be=Yt;let U=[],_t=[""],P="global",j="",V=[{alias:"global",label:{text:"global"},type:{text:"global"},tags:null,link:null,parentBoundary:""}],St=[],te="",ee=!1,It=4,jt=2;var de;const Ye=function(){return de},Ie=function(e){de=ue(e,Dt())},je=function(e,t,a,o,l,i,s,r,n){if(e==null||t===void 0||t===null||a===void 0||a===null||o===void 0||o===null)return;let h={};const f=St.find(d=>d.from===t&&d.to===a);if(f?h=f:St.push(h),h.type=e,h.from=t,h.to=a,h.label={text:o},l==null)h.techn={text:""};else if(typeof l=="object"){let[d,p]=Object.entries(l)[0];h[d]={text:p}}else h.techn={text:l};if(i==null)h.descr={text:""};else if(typeof i=="object"){let[d,p]=Object.entries(i)[0];h[d]={text:p}}else h.descr={text:i};if(typeof s=="object"){let[d,p]=Object.entries(s)[0];h[d]=p}else h.sprite=s;if(typeof r=="object"){let[d,p]=Object.entries(r)[0];h[d]=p}else h.tags=r;if(typeof n=="object"){let[d,p]=Object.entries(n)[0];h[d]=p}else h.link=n;h.wrap=xt()},Ue=function(e,t,a,o,l,i,s){if(t===null||a===null)return;let r={};const n=U.find(h=>h.alias===t);if(n&&t===n.alias?r=n:(r.alias=t,U.push(r)),a==null?r.label={text:""}:r.label={text:a},o==null)r.descr={text:""};else if(typeof o=="object"){let[h,f]=Object.entries(o)[0];r[h]={text:f}}else r.descr={text:o};if(typeof l=="object"){let[h,f]=Object.entries(l)[0];r[h]=f}else r.sprite=l;if(typeof 
i=="object"){let[h,f]=Object.entries(i)[0];r[h]=f}else r.tags=i;if(typeof s=="object"){let[h,f]=Object.entries(s)[0];r[h]=f}else r.link=s;r.typeC4Shape={text:e},r.parentBoundary=P,r.wrap=xt()},Fe=function(e,t,a,o,l,i,s,r){if(t===null||a===null)return;let n={};const h=U.find(f=>f.alias===t);if(h&&t===h.alias?n=h:(n.alias=t,U.push(n)),a==null?n.label={text:""}:n.label={text:a},o==null)n.techn={text:""};else if(typeof o=="object"){let[f,d]=Object.entries(o)[0];n[f]={text:d}}else n.techn={text:o};if(l==null)n.descr={text:""};else if(typeof l=="object"){let[f,d]=Object.entries(l)[0];n[f]={text:d}}else n.descr={text:l};if(typeof i=="object"){let[f,d]=Object.entries(i)[0];n[f]=d}else n.sprite=i;if(typeof s=="object"){let[f,d]=Object.entries(s)[0];n[f]=d}else n.tags=s;if(typeof r=="object"){let[f,d]=Object.entries(r)[0];n[f]=d}else n.link=r;n.wrap=xt(),n.typeC4Shape={text:e},n.parentBoundary=P},Ve=function(e,t,a,o,l,i,s,r){if(t===null||a===null)return;let n={};const h=U.find(f=>f.alias===t);if(h&&t===h.alias?n=h:(n.alias=t,U.push(n)),a==null?n.label={text:""}:n.label={text:a},o==null)n.techn={text:""};else if(typeof o=="object"){let[f,d]=Object.entries(o)[0];n[f]={text:d}}else n.techn={text:o};if(l==null)n.descr={text:""};else if(typeof l=="object"){let[f,d]=Object.entries(l)[0];n[f]={text:d}}else n.descr={text:l};if(typeof i=="object"){let[f,d]=Object.entries(i)[0];n[f]=d}else n.sprite=i;if(typeof s=="object"){let[f,d]=Object.entries(s)[0];n[f]=d}else n.tags=s;if(typeof r=="object"){let[f,d]=Object.entries(r)[0];n[f]=d}else n.link=r;n.wrap=xt(),n.typeC4Shape={text:e},n.parentBoundary=P},ze=function(e,t,a,o,l){if(e===null||t===null)return;let i={};const s=V.find(r=>r.alias===e);if(s&&e===s.alias?i=s:(i.alias=e,V.push(i)),t==null?i.label={text:""}:i.label={text:t},a==null)i.type={text:"system"};else if(typeof a=="object"){let[r,n]=Object.entries(a)[0];i[r]={text:n}}else i.type={text:a};if(typeof o=="object"){let[r,n]=Object.entries(o)[0];i[r]=n}else i.tags=o;if(typeof 
l=="object"){let[r,n]=Object.entries(l)[0];i[r]=n}else i.link=l;i.parentBoundary=P,i.wrap=xt(),j=P,P=e,_t.push(j)},Xe=function(e,t,a,o,l){if(e===null||t===null)return;let i={};const s=V.find(r=>r.alias===e);if(s&&e===s.alias?i=s:(i.alias=e,V.push(i)),t==null?i.label={text:""}:i.label={text:t},a==null)i.type={text:"container"};else if(typeof a=="object"){let[r,n]=Object.entries(a)[0];i[r]={text:n}}else i.type={text:a};if(typeof o=="object"){let[r,n]=Object.entries(o)[0];i[r]=n}else i.tags=o;if(typeof l=="object"){let[r,n]=Object.entries(l)[0];i[r]=n}else i.link=l;i.parentBoundary=P,i.wrap=xt(),j=P,P=e,_t.push(j)},We=function(e,t,a,o,l,i,s,r){if(t===null||a===null)return;let n={};const h=V.find(f=>f.alias===t);if(h&&t===h.alias?n=h:(n.alias=t,V.push(n)),a==null?n.label={text:""}:n.label={text:a},o==null)n.type={text:"node"};else if(typeof o=="object"){let[f,d]=Object.entries(o)[0];n[f]={text:d}}else n.type={text:o};if(l==null)n.descr={text:""};else if(typeof l=="object"){let[f,d]=Object.entries(l)[0];n[f]={text:d}}else n.descr={text:l};if(typeof s=="object"){let[f,d]=Object.entries(s)[0];n[f]=d}else n.tags=s;if(typeof r=="object"){let[f,d]=Object.entries(r)[0];n[f]=d}else n.link=r;n.nodeType=e,n.parentBoundary=P,n.wrap=xt(),j=P,P=t,_t.push(j)},Qe=function(){P=j,_t.pop(),j=_t.pop(),_t.push(j)},He=function(e,t,a,o,l,i,s,r,n,h,f){let d=U.find(p=>p.alias===t);if(!(d===void 0&&(d=V.find(p=>p.alias===t),d===void 0))){if(a!=null)if(typeof a=="object"){let[p,E]=Object.entries(a)[0];d[p]=E}else d.bgColor=a;if(o!=null)if(typeof o=="object"){let[p,E]=Object.entries(o)[0];d[p]=E}else d.fontColor=o;if(l!=null)if(typeof l=="object"){let[p,E]=Object.entries(l)[0];d[p]=E}else d.borderColor=l;if(i!=null)if(typeof i=="object"){let[p,E]=Object.entries(i)[0];d[p]=E}else d.shadowing=i;if(s!=null)if(typeof s=="object"){let[p,E]=Object.entries(s)[0];d[p]=E}else d.shape=s;if(r!=null)if(typeof r=="object"){let[p,E]=Object.entries(r)[0];d[p]=E}else d.sprite=r;if(n!=null)if(typeof 
n=="object"){let[p,E]=Object.entries(n)[0];d[p]=E}else d.techn=n;if(h!=null)if(typeof h=="object"){let[p,E]=Object.entries(h)[0];d[p]=E}else d.legendText=h;if(f!=null)if(typeof f=="object"){let[p,E]=Object.entries(f)[0];d[p]=E}else d.legendSprite=f}},qe=function(e,t,a,o,l,i,s){const r=St.find(n=>n.from===t&&n.to===a);if(r!==void 0){if(o!=null)if(typeof o=="object"){let[n,h]=Object.entries(o)[0];r[n]=h}else r.textColor=o;if(l!=null)if(typeof l=="object"){let[n,h]=Object.entries(l)[0];r[n]=h}else r.lineColor=l;if(i!=null)if(typeof i=="object"){let[n,h]=Object.entries(i)[0];r[n]=parseInt(h)}else r.offsetX=parseInt(i);if(s!=null)if(typeof s=="object"){let[n,h]=Object.entries(s)[0];r[n]=parseInt(h)}else r.offsetY=parseInt(s)}},Ge=function(e,t,a){let o=It,l=jt;if(typeof t=="object"){const i=Object.values(t)[0];o=parseInt(i)}else o=parseInt(t);if(typeof a=="object"){const i=Object.values(a)[0];l=parseInt(i)}else l=parseInt(a);o>=1&&(It=o),l>=1&&(jt=l)},Ke=function(){return It},Je=function(){return jt},Ze=function(){return P},$e=function(){return j},fe=function(e){return e==null?U:U.filter(t=>t.parentBoundary===e)},t0=function(e){return U.find(t=>t.alias===e)},e0=function(e){return Object.keys(fe(e))},pe=function(e){return e==null?V:V.filter(t=>t.parentBoundary===e)},i0=pe,n0=function(){return St},s0=function(){return te},a0=function(e){ee=e},xt=function(){return 
ee},r0=function(){U=[],V=[{alias:"global",label:{text:"global"},type:{text:"global"},tags:null,link:null,parentBoundary:""}],j="",P="global",_t=[""],St=[],_t=[""],te="",ee=!1,It=4,jt=2},l0={SOLID:0,DOTTED:1,NOTE:2,SOLID_CROSS:3,DOTTED_CROSS:4,SOLID_OPEN:5,DOTTED_OPEN:6,LOOP_START:10,LOOP_END:11,ALT_START:12,ALT_ELSE:13,ALT_END:14,OPT_START:15,OPT_END:16,ACTIVE_START:17,ACTIVE_END:18,PAR_START:19,PAR_AND:20,PAR_END:21,RECT_START:22,RECT_END:23,SOLID_POINT:24,DOTTED_POINT:25},o0={FILLED:0,OPEN:1},c0={LEFTOF:0,RIGHTOF:1,OVER:2},h0=function(e){te=ue(e,Dt())},Jt={addPersonOrSystem:Ue,addPersonOrSystemBoundary:ze,addContainer:Fe,addContainerBoundary:Xe,addComponent:Ve,addDeploymentNode:We,popBoundaryParseStack:Qe,addRel:je,updateElStyle:He,updateRelStyle:qe,updateLayoutConfig:Ge,autoWrap:xt,setWrap:a0,getC4ShapeArray:fe,getC4Shape:t0,getC4ShapeKeys:e0,getBoundaries:pe,getBoundarys:i0,getCurrentBoundaryParse:Ze,getParentBoundaryParse:$e,getRels:n0,getTitle:s0,getC4Type:Ye,getC4ShapeInRow:Ke,getC4BoundaryInRow:Je,setAccTitle:Re,getAccTitle:Te,getAccDescription:Oe,setAccDescription:we,getConfig:()=>Dt().c4,clear:r0,LINETYPE:l0,ARROWTYPE:o0,PLACEMENT:c0,setTitle:h0,setC4Type:Ie},ie=function(e,t){return Le(e,t)},ye=function(e,t,a,o,l,i){const s=e.append("image");s.attr("width",t),s.attr("height",a),s.attr("x",o),s.attr("y",l);let r=i.startsWith("data:image/png;base64")?i:Pe.sanitizeUrl(i);s.attr("xlink:href",r)},u0=(e,t,a)=>{const o=e.append("g");let l=0;for(let i of t){let s=i.textColor?i.textColor:"#444444",r=i.lineColor?i.lineColor:"#444444",n=i.offsetX?parseInt(i.offsetX):0,h=i.offsetY?parseInt(i.offsetY):0,f="";if(l===0){let 
p=o.append("line");p.attr("x1",i.startPoint.x),p.attr("y1",i.startPoint.y),p.attr("x2",i.endPoint.x),p.attr("y2",i.endPoint.y),p.attr("stroke-width","1"),p.attr("stroke",r),p.style("fill","none"),i.type!=="rel_b"&&p.attr("marker-end","url("+f+"#arrowhead)"),(i.type==="birel"||i.type==="rel_b")&&p.attr("marker-start","url("+f+"#arrowend)"),l=-1}else{let p=o.append("path");p.attr("fill","none").attr("stroke-width","1").attr("stroke",r).attr("d","Mstartx,starty Qcontrolx,controly stopx,stopy ".replaceAll("startx",i.startPoint.x).replaceAll("starty",i.startPoint.y).replaceAll("controlx",i.startPoint.x+(i.endPoint.x-i.startPoint.x)/2-(i.endPoint.x-i.startPoint.x)/4).replaceAll("controly",i.startPoint.y+(i.endPoint.y-i.startPoint.y)/2).replaceAll("stopx",i.endPoint.x).replaceAll("stopy",i.endPoint.y)),i.type!=="rel_b"&&p.attr("marker-end","url("+f+"#arrowhead)"),(i.type==="birel"||i.type==="rel_b")&&p.attr("marker-start","url("+f+"#arrowend)")}let d=a.messageFont();W(a)(i.label.text,o,Math.min(i.startPoint.x,i.endPoint.x)+Math.abs(i.endPoint.x-i.startPoint.x)/2+n,Math.min(i.startPoint.y,i.endPoint.y)+Math.abs(i.endPoint.y-i.startPoint.y)/2+h,i.label.width,i.label.height,{fill:s},d),i.techn&&i.techn.text!==""&&(d=a.messageFont(),W(a)("["+i.techn.text+"]",o,Math.min(i.startPoint.x,i.endPoint.x)+Math.abs(i.endPoint.x-i.startPoint.x)/2+n,Math.min(i.startPoint.y,i.endPoint.y)+Math.abs(i.endPoint.y-i.startPoint.y)/2+a.messageFontSize+5+h,Math.max(i.label.width,i.techn.width),i.techn.height,{fill:s,"font-style":"italic"},d))}},d0=function(e,t,a){const o=e.append("g");let l=t.bgColor?t.bgColor:"none",i=t.borderColor?t.borderColor:"#444444",s=t.fontColor?t.fontColor:"black",r={"stroke-width":1,"stroke-dasharray":"7.0,7.0"};t.nodeType&&(r={"stroke-width":1});let n={x:t.x,y:t.y,fill:l,stroke:i,width:t.width,height:t.height,rx:2.5,ry:2.5,attrs:r};ie(o,n);let 
h=a.boundaryFont();h.fontWeight="bold",h.fontSize=h.fontSize+2,h.fontColor=s,W(a)(t.label.text,o,t.x,t.y+t.label.Y,t.width,t.height,{fill:"#444444"},h),t.type&&t.type.text!==""&&(h=a.boundaryFont(),h.fontColor=s,W(a)(t.type.text,o,t.x,t.y+t.type.Y,t.width,t.height,{fill:"#444444"},h)),t.descr&&t.descr.text!==""&&(h=a.boundaryFont(),h.fontSize=h.fontSize-2,h.fontColor=s,W(a)(t.descr.text,o,t.x,t.y+t.descr.Y,t.width,t.height,{fill:"#444444"},h))},f0=function(e,t,a){var o;let l=t.bgColor?t.bgColor:a[t.typeC4Shape.text+"_bg_color"],i=t.borderColor?t.borderColor:a[t.typeC4Shape.text+"_border_color"],s=t.fontColor?t.fontColor:"#FFFFFF",r="";switch(t.typeC4Shape.text){case"person":r="";break;case"external_person":r="";break}const n=e.append("g");n.attr("class","person-man");const h=Ne();switch(t.typeC4Shape.text){case"person":case"external_person":case"system":case"external_system":case"container":case"external_container":case"component":case"external_component":h.x=t.x,h.y=t.y,h.fill=l,h.width=t.width,h.height=t.height,h.stroke=i,h.rx=2.5,h.ry=2.5,h.attrs={"stroke-width":.5},ie(n,h);break;case"system_db":case"external_system_db":case"container_db":case"external_container_db":case"component_db":case"external_component_db":n.append("path").attr("fill",l).attr("stroke-width","0.5").attr("stroke",i).attr("d","Mstartx,startyc0,-10 half,-10 half,-10c0,0 half,0 half,10l0,heightc0,10 -half,10 -half,10c0,0 -half,0 -half,-10l0,-height".replaceAll("startx",t.x).replaceAll("starty",t.y).replaceAll("half",t.width/2).replaceAll("height",t.height)),n.append("path").attr("fill","none").attr("stroke-width","0.5").attr("stroke",i).attr("d","Mstartx,startyc0,10 half,10 half,10c0,0 half,0 
half,-10".replaceAll("startx",t.x).replaceAll("starty",t.y).replaceAll("half",t.width/2));break;case"system_queue":case"external_system_queue":case"container_queue":case"external_container_queue":case"component_queue":case"external_component_queue":n.append("path").attr("fill",l).attr("stroke-width","0.5").attr("stroke",i).attr("d","Mstartx,startylwidth,0c5,0 5,half 5,halfc0,0 0,half -5,halfl-width,0c-5,0 -5,-half -5,-halfc0,0 0,-half 5,-half".replaceAll("startx",t.x).replaceAll("starty",t.y).replaceAll("width",t.width).replaceAll("half",t.height/2)),n.append("path").attr("fill","none").attr("stroke-width","0.5").attr("stroke",i).attr("d","Mstartx,startyc-5,0 -5,half -5,halfc0,half 5,half 5,half".replaceAll("startx",t.x+t.width).replaceAll("starty",t.y).replaceAll("half",t.height/2));break}let f=v0(a,t.typeC4Shape.text);switch(n.append("text").attr("fill",s).attr("font-family",f.fontFamily).attr("font-size",f.fontSize-2).attr("font-style","italic").attr("lengthAdjust","spacing").attr("textLength",t.typeC4Shape.width).attr("x",t.x+t.width/2-t.typeC4Shape.width/2).attr("y",t.y+t.typeC4Shape.Y).text("<<"+t.typeC4Shape.text+">>"),t.typeC4Shape.text){case"person":case"external_person":ye(n,48,48,t.x+t.width/2-24,t.y+t.image.Y,r);break}let d=a[t.typeC4Shape.text+"Font"]();return d.fontWeight="bold",d.fontSize=d.fontSize+2,d.fontColor=s,W(a)(t.label.text,n,t.x,t.y+t.label.Y,t.width,t.height,{fill:s},d),d=a[t.typeC4Shape.text+"Font"](),d.fontColor=s,t.techn&&((o=t.techn)==null?void 
0:o.text)!==""?W(a)(t.techn.text,n,t.x,t.y+t.techn.Y,t.width,t.height,{fill:s,"font-style":"italic"},d):t.type&&t.type.text!==""&&W(a)(t.type.text,n,t.x,t.y+t.type.Y,t.width,t.height,{fill:s,"font-style":"italic"},d),t.descr&&t.descr.text!==""&&(d=a.personFont(),d.fontColor=s,W(a)(t.descr.text,n,t.x,t.y+t.descr.Y,t.width,t.height,{fill:s},d)),t.height},p0=function(e){e.append("defs").append("symbol").attr("id","database").attr("fill-rule","evenodd").attr("clip-rule","evenodd").append("path").attr("transform","scale(.5)").attr("d","M12.258.001l.256.004.255.005.253.008.251.01.249.012.247.015.246.016.242.019.241.02.239.023.236.024.233.027.231.028.229.031.225.032.223.034.22.036.217.038.214.04.211.041.208.043.205.045.201.046.198.048.194.05.191.051.187.053.183.054.18.056.175.057.172.059.168.06.163.061.16.063.155.064.15.066.074.033.073.033.071.034.07.034.069.035.068.035.067.035.066.035.064.036.064.036.062.036.06.036.06.037.058.037.058.037.055.038.055.038.053.038.052.038.051.039.05.039.048.039.047.039.045.04.044.04.043.04.041.04.04.041.039.041.037.041.036.041.034.041.033.042.032.042.03.042.029.042.027.042.026.043.024.043.023.043.021.043.02.043.018.044.017.043.015.044.013.044.012.044.011.045.009.044.007.045.006.045.004.045.002.045.001.045v17l-.001.045-.002.045-.004.045-.006.045-.007.045-.009.044-.011.045-.012.044-.013.044-.015.044-.017.043-.018.044-.02.043-.021.043-.023.043-.024.043-.026.043-.027.042-.029.042-.03.042-.032.042-.033.042-.034.041-.036.041-.037.041-.039.041-.04.041-.041.04-.043.04-.044.04-.045.04-.047.039-.048.039-.05.039-.051.039-.052.038-.053.038-.055.038-.055.038-.058.037-.058.037-.06.037-.06.036-.062.036-.064.036-.064.036-.066.035-.067.035-.068.035-.069.035-.07.034-.071.034-.073.033-.074.033-.15.066-.155.064-.16.063-.163.061-.168.06-.172.059-.175.057-.18.056-.183.054-.187.053-.191.051-.194.05-.198.048-.201.046-.205.045-.208.043-.211.041-.214.04-.217.038-.22.036-.223.034-.225.032-.229.031-.231.028-.233.027-.236.024-.239.023-.241.02-.242.019-.246.016-.247.015-
.249.012-.251.01-.253.008-.255.005-.256.004-.258.001-.258-.001-.256-.004-.255-.005-.253-.008-.251-.01-.249-.012-.247-.015-.245-.016-.243-.019-.241-.02-.238-.023-.236-.024-.234-.027-.231-.028-.228-.031-.226-.032-.223-.034-.22-.036-.217-.038-.214-.04-.211-.041-.208-.043-.204-.045-.201-.046-.198-.048-.195-.05-.19-.051-.187-.053-.184-.054-.179-.056-.176-.057-.172-.059-.167-.06-.164-.061-.159-.063-.155-.064-.151-.066-.074-.033-.072-.033-.072-.034-.07-.034-.069-.035-.068-.035-.067-.035-.066-.035-.064-.036-.063-.036-.062-.036-.061-.036-.06-.037-.058-.037-.057-.037-.056-.038-.055-.038-.053-.038-.052-.038-.051-.039-.049-.039-.049-.039-.046-.039-.046-.04-.044-.04-.043-.04-.041-.04-.04-.041-.039-.041-.037-.041-.036-.041-.034-.041-.033-.042-.032-.042-.03-.042-.029-.042-.027-.042-.026-.043-.024-.043-.023-.043-.021-.043-.02-.043-.018-.044-.017-.043-.015-.044-.013-.044-.012-.044-.011-.045-.009-.044-.007-.045-.006-.045-.004-.045-.002-.045-.001-.045v-17l.001-.045.002-.045.004-.045.006-.045.007-.045.009-.044.011-.045.012-.044.013-.044.015-.044.017-.043.018-.044.02-.043.021-.043.023-.043.024-.043.026-.043.027-.042.029-.042.03-.042.032-.042.033-.042.034-.041.036-.041.037-.041.039-.041.04-.041.041-.04.043-.04.044-.04.046-.04.046-.039.049-.039.049-.039.051-.039.052-.038.053-.038.055-.038.056-.038.057-.037.058-.037.06-.037.061-.036.062-.036.063-.036.064-.036.066-.035.067-.035.068-.035.069-.035.07-.034.072-.034.072-.033.074-.033.151-.066.155-.064.159-.063.164-.061.167-.06.172-.059.176-.057.179-.056.184-.054.187-.053.19-.051.195-.05.198-.048.201-.046.204-.045.208-.043.211-.041.214-.04.217-.038.22-.036.223-.034.226-.032.228-.031.231-.028.234-.027.236-.024.238-.023.241-.02.243-.019.245-.016.247-.015.249-.012.251-.01.253-.008.255-.005.256-.004.258-.001.258.001zm-9.258 
20.499v.01l.001.021.003.021.004.022.005.021.006.022.007.022.009.023.01.022.011.023.012.023.013.023.015.023.016.024.017.023.018.024.019.024.021.024.022.025.023.024.024.025.052.049.056.05.061.051.066.051.07.051.075.051.079.052.084.052.088.052.092.052.097.052.102.051.105.052.11.052.114.051.119.051.123.051.127.05.131.05.135.05.139.048.144.049.147.047.152.047.155.047.16.045.163.045.167.043.171.043.176.041.178.041.183.039.187.039.19.037.194.035.197.035.202.033.204.031.209.03.212.029.216.027.219.025.222.024.226.021.23.02.233.018.236.016.24.015.243.012.246.01.249.008.253.005.256.004.259.001.26-.001.257-.004.254-.005.25-.008.247-.011.244-.012.241-.014.237-.016.233-.018.231-.021.226-.021.224-.024.22-.026.216-.027.212-.028.21-.031.205-.031.202-.034.198-.034.194-.036.191-.037.187-.039.183-.04.179-.04.175-.042.172-.043.168-.044.163-.045.16-.046.155-.046.152-.047.148-.048.143-.049.139-.049.136-.05.131-.05.126-.05.123-.051.118-.052.114-.051.11-.052.106-.052.101-.052.096-.052.092-.052.088-.053.083-.051.079-.052.074-.052.07-.051.065-.051.06-.051.056-.05.051-.05.023-.024.023-.025.021-.024.02-.024.019-.024.018-.024.017-.024.015-.023.014-.024.013-.023.012-.023.01-.023.01-.022.008-.022.006-.022.006-.022.004-.022.004-.021.001-.021.001-.021v-4.127l-.077.055-.08.053-.083.054-.085.053-.087.052-.09.052-.093.051-.095.05-.097.05-.1.049-.102.049-.105.048-.106.047-.109.047-.111.046-.114.045-.115.045-.118.044-.12.043-.122.042-.124.042-.126.041-.128.04-.13.04-.132.038-.134.038-.135.037-.138.037-.139.035-.142.035-.143.034-.144.033-.147.032-.148.031-.15.03-.151.03-.153.029-.154.027-.156.027-.158.026-.159.025-.161.024-.162.023-.163.022-.165.021-.166.02-.167.019-.169.018-.169.017-.171.016-.173.015-.173.014-.175.013-.175.012-.177.011-.178.01-.179.008-.179.008-.181.006-.182.005-.182.004-.184.003-.184.002h-.37l-.184-.002-.184-.003-.182-.004-.182-.005-.181-.006-.179-.008-.179-.008-.178-.01-.176-.011-.176-.012-.175-.013-.173-.014-.172-.015-.171-.016-.17-.017-.169-.018-.167-.019-.166-.02-.165-.021-.163-.022
-.162-.023-.161-.024-.159-.025-.157-.026-.156-.027-.155-.027-.153-.029-.151-.03-.15-.03-.148-.031-.146-.032-.145-.033-.143-.034-.141-.035-.14-.035-.137-.037-.136-.037-.134-.038-.132-.038-.13-.04-.128-.04-.126-.041-.124-.042-.122-.042-.12-.044-.117-.043-.116-.045-.113-.045-.112-.046-.109-.047-.106-.047-.105-.048-.102-.049-.1-.049-.097-.05-.095-.05-.093-.052-.09-.051-.087-.052-.085-.053-.083-.054-.08-.054-.077-.054v4.127zm0-5.654v.011l.001.021.003.021.004.021.005.022.006.022.007.022.009.022.01.022.011.023.012.023.013.023.015.024.016.023.017.024.018.024.019.024.021.024.022.024.023.025.024.024.052.05.056.05.061.05.066.051.07.051.075.052.079.051.084.052.088.052.092.052.097.052.102.052.105.052.11.051.114.051.119.052.123.05.127.051.131.05.135.049.139.049.144.048.147.048.152.047.155.046.16.045.163.045.167.044.171.042.176.042.178.04.183.04.187.038.19.037.194.036.197.034.202.033.204.032.209.03.212.028.216.027.219.025.222.024.226.022.23.02.233.018.236.016.24.014.243.012.246.01.249.008.253.006.256.003.259.001.26-.001.257-.003.254-.006.25-.008.247-.01.244-.012.241-.015.237-.016.233-.018.231-.02.226-.022.224-.024.22-.025.216-.027.212-.029.21-.03.205-.032.202-.033.198-.035.194-.036.191-.037.187-.039.183-.039.179-.041.175-.042.172-.043.168-.044.163-.045.16-.045.155-.047.152-.047.148-.048.143-.048.139-.05.136-.049.131-.05.126-.051.123-.051.118-.051.114-.052.11-.052.106-.052.101-.052.096-.052.092-.052.088-.052.083-.052.079-.052.074-.051.07-.052.065-.051.06-.05.056-.051.051-.049.023-.025.023-.024.021-.025.02-.024.019-.024.018-.024.017-.024.015-.023.014-.023.013-.024.012-.022.01-.023.01-.023.008-.022.006-.022.006-.022.004-.021.004-.022.001-.021.001-.021v-4.139l-.077.054-.08.054-.083.054-.085.052-.087.053-.09.051-.093.051-.095.051-.097.05-.1.049-.102.049-.105.048-.106.047-.109.047-.111.046-.114.045-.115.044-.118.044-.12.044-.122.042-.124.042-.126.041-.128.04-.13.039-.132.039-.134.038-.135.037-.138.036-.139.036-.142.035-.143.033-.144.033-.147.033-.148.031-.15.03-.151.03-.153.028-.154.028
-.156.027-.158.026-.159.025-.161.024-.162.023-.163.022-.165.021-.166.02-.167.019-.169.018-.169.017-.171.016-.173.015-.173.014-.175.013-.175.012-.177.011-.178.009-.179.009-.179.007-.181.007-.182.005-.182.004-.184.003-.184.002h-.37l-.184-.002-.184-.003-.182-.004-.182-.005-.181-.007-.179-.007-.179-.009-.178-.009-.176-.011-.176-.012-.175-.013-.173-.014-.172-.015-.171-.016-.17-.017-.169-.018-.167-.019-.166-.02-.165-.021-.163-.022-.162-.023-.161-.024-.159-.025-.157-.026-.156-.027-.155-.028-.153-.028-.151-.03-.15-.03-.148-.031-.146-.033-.145-.033-.143-.033-.141-.035-.14-.036-.137-.036-.136-.037-.134-.038-.132-.039-.13-.039-.128-.04-.126-.041-.124-.042-.122-.043-.12-.043-.117-.044-.116-.044-.113-.046-.112-.046-.109-.046-.106-.047-.105-.048-.102-.049-.1-.049-.097-.05-.095-.051-.093-.051-.09-.051-.087-.053-.085-.052-.083-.054-.08-.054-.077-.054v4.139zm0-5.666v.011l.001.02.003.022.004.021.005.022.006.021.007.022.009.023.01.022.011.023.012.023.013.023.015.023.016.024.017.024.018.023.019.024.021.025.022.024.023.024.024.025.052.05.056.05.061.05.066.051.07.051.075.052.079.051.084.052.088.052.092.052.097.052.102.052.105.051.11.052.114.051.119.051.123.051.127.05.131.05.135.05.139.049.144.048.147.048.152.047.155.046.16.045.163.045.167.043.171.043.176.042.178.04.183.04.187.038.19.037.194.036.197.034.202.033.204.032.209.03.212.028.216.027.219.025.222.024.226.021.23.02.233.018.236.017.24.014.243.012.246.01.249.008.253.006.256.003.259.001.26-.001.257-.003.254-.006.25-.008.247-.01.244-.013.241-.014.237-.016.233-.018.231-.02.226-.022.224-.024.22-.025.216-.027.212-.029.21-.03.205-.032.202-.033.198-.035.194-.036.191-.037.187-.039.183-.039.179-.041.175-.042.172-.043.168-.044.163-.045.16-.045.155-.047.152-.047.148-.048.143-.049.139-.049.136-.049.131-.051.126-.05.123-.051.118-.052.114-.051.11-.052.106-.052.101-.052.096-.052.092-.052.088-.052.083-.052.079-.052.074-.052.07-.051.065-.051.06-.051.056-.05.051-.049.023-.025.023-.025.021-.024.02-.024.019-.024.018-.024.017-.024.015-.023.014-.024.013-.0
23.012-.023.01-.022.01-.023.008-.022.006-.022.006-.022.004-.022.004-.021.001-.021.001-.021v-4.153l-.077.054-.08.054-.083.053-.085.053-.087.053-.09.051-.093.051-.095.051-.097.05-.1.049-.102.048-.105.048-.106.048-.109.046-.111.046-.114.046-.115.044-.118.044-.12.043-.122.043-.124.042-.126.041-.128.04-.13.039-.132.039-.134.038-.135.037-.138.036-.139.036-.142.034-.143.034-.144.033-.147.032-.148.032-.15.03-.151.03-.153.028-.154.028-.156.027-.158.026-.159.024-.161.024-.162.023-.163.023-.165.021-.166.02-.167.019-.169.018-.169.017-.171.016-.173.015-.173.014-.175.013-.175.012-.177.01-.178.01-.179.009-.179.007-.181.006-.182.006-.182.004-.184.003-.184.001-.185.001-.185-.001-.184-.001-.184-.003-.182-.004-.182-.006-.181-.006-.179-.007-.179-.009-.178-.01-.176-.01-.176-.012-.175-.013-.173-.014-.172-.015-.171-.016-.17-.017-.169-.018-.167-.019-.166-.02-.165-.021-.163-.023-.162-.023-.161-.024-.159-.024-.157-.026-.156-.027-.155-.028-.153-.028-.151-.03-.15-.03-.148-.032-.146-.032-.145-.033-.143-.034-.141-.034-.14-.036-.137-.036-.136-.037-.134-.038-.132-.039-.13-.039-.128-.041-.126-.041-.124-.041-.122-.043-.12-.043-.117-.044-.116-.044-.113-.046-.112-.046-.109-.046-.106-.048-.105-.048-.102-.048-.1-.05-.097-.049-.095-.051-.093-.051-.09-.052-.087-.052-.085-.053-.083-.053-.08-.054-.077-.054v4.153zm8.74-8.179l-.257.004-.254.005-.25.008-.247.011-.244.012-.241.014-.237.016-.233.018-.231.021-.226.022-.224.023-.22.026-.216.027-.212.028-.21.031-.205.032-.202.033-.198.034-.194.036-.191.038-.187.038-.183.04-.179.041-.175.042-.172.043-.168.043-.163.045-.16.046-.155.046-.152.048-.148.048-.143.048-.139.049-.136.05-.131.05-.126.051-.123.051-.118.051-.114.052-.11.052-.106.052-.101.052-.096.052-.092.052-.088.052-.083.052-.079.052-.074.051-.07.052-.065.051-.06.05-.056.05-.051.05-.023.025-.023.024-.021.024-.02.025-.019.024-.018.024-.017.023-.015.024-.014.023-.013.023-.012.023-.01.023-.01.022-.008.022-.006.023-.006.021-.004.022-.004.021-.001.021-.001.021.001.021.001.021.004.021.004.022.006.021.006.023.008.02
2.01.022.01.023.012.023.013.023.014.023.015.024.017.023.018.024.019.024.02.025.021.024.023.024.023.025.051.05.056.05.06.05.065.051.07.052.074.051.079.052.083.052.088.052.092.052.096.052.101.052.106.052.11.052.114.052.118.051.123.051.126.051.131.05.136.05.139.049.143.048.148.048.152.048.155.046.16.046.163.045.168.043.172.043.175.042.179.041.183.04.187.038.191.038.194.036.198.034.202.033.205.032.21.031.212.028.216.027.22.026.224.023.226.022.231.021.233.018.237.016.241.014.244.012.247.011.25.008.254.005.257.004.26.001.26-.001.257-.004.254-.005.25-.008.247-.011.244-.012.241-.014.237-.016.233-.018.231-.021.226-.022.224-.023.22-.026.216-.027.212-.028.21-.031.205-.032.202-.033.198-.034.194-.036.191-.038.187-.038.183-.04.179-.041.175-.042.172-.043.168-.043.163-.045.16-.046.155-.046.152-.048.148-.048.143-.048.139-.049.136-.05.131-.05.126-.051.123-.051.118-.051.114-.052.11-.052.106-.052.101-.052.096-.052.092-.052.088-.052.083-.052.079-.052.074-.051.07-.052.065-.051.06-.05.056-.05.051-.05.023-.025.023-.024.021-.024.02-.025.019-.024.018-.024.017-.023.015-.024.014-.023.013-.023.012-.023.01-.023.01-.022.008-.022.006-.023.006-.021.004-.022.004-.021.001-.021.001-.021-.001-.021-.001-.021-.004-.021-.004-.022-.006-.021-.006-.023-.008-.022-.01-.022-.01-.023-.012-.023-.013-.023-.014-.023-.015-.024-.017-.023-.018-.024-.019-.024-.02-.025-.021-.024-.023-.024-.023-.025-.051-.05-.056-.05-.06-.05-.065-.051-.07-.052-.074-.051-.079-.052-.083-.052-.088-.052-.092-.052-.096-.052-.101-.052-.106-.052-.11-.052-.114-.052-.118-.051-.123-.051-.126-.051-.131-.05-.136-.05-.139-.049-.143-.048-.148-.048-.152-.048-.155-.046-.16-.046-.163-.045-.168-.043-.172-.043-.175-.042-.179-.041-.183-.04-.187-.038-.191-.038-.194-.036-.198-.034-.202-.033-.205-.032-.21-.031-.212-.028-.216-.027-.22-.026-.224-.023-.226-.022-.231-.021-.233-.018-.237-.016-.241-.014-.244-.012-.247-.011-.25-.008-.254-.005-.257-.004-.26-.001-.26.001z")},y0=function(e){e.append("defs").append("symbol").attr("id","computer").attr("width","24").attr(
"height","24").append("path").attr("transform","scale(.5)").attr("d","M2 2v13h20v-13h-20zm18 11h-16v-9h16v9zm-10.228 6l.466-1h3.524l.467 1h-4.457zm14.228 3h-24l2-6h2.104l-1.33 4h18.45l-1.297-4h2.073l2 6zm-5-10h-14v-7h14v7z")},g0=function(e){e.append("defs").append("symbol").attr("id","clock").attr("width","24").attr("height","24").append("path").attr("transform","scale(.5)").attr("d","M12 2c5.514 0 10 4.486 10 10s-4.486 10-10 10-10-4.486-10-10 4.486-10 10-10zm0-2c-6.627 0-12 5.373-12 12s5.373 12 12 12 12-5.373 12-12-5.373-12-12-12zm5.848 12.459c.202.038.202.333.001.372-1.907.361-6.045 1.111-6.547 1.111-.719 0-1.301-.582-1.301-1.301 0-.512.77-5.447 1.125-7.445.034-.192.312-.181.343.014l.985 6.238 5.394 1.011z")},b0=function(e){e.append("defs").append("marker").attr("id","arrowhead").attr("refX",9).attr("refY",5).attr("markerUnits","userSpaceOnUse").attr("markerWidth",12).attr("markerHeight",12).attr("orient","auto").append("path").attr("d","M 0 0 L 10 5 L 0 10 z")},_0=function(e){e.append("defs").append("marker").attr("id","arrowend").attr("refX",1).attr("refY",5).attr("markerUnits","userSpaceOnUse").attr("markerWidth",12).attr("markerHeight",12).attr("orient","auto").append("path").attr("d","M 10 0 L 0 5 L 10 10 z")},x0=function(e){e.append("defs").append("marker").attr("id","filled-head").attr("refX",18).attr("refY",7).attr("markerWidth",20).attr("markerHeight",28).attr("orient","auto").append("path").attr("d","M 18,7 L9,13 L14,7 L9,1 Z")},m0=function(e){e.append("defs").append("marker").attr("id","sequencenumber").attr("refX",15).attr("refY",15).attr("markerWidth",60).attr("markerHeight",40).attr("orient","auto").append("circle").attr("cx",15).attr("cy",15).attr("r",6)},E0=function(e){const a=e.append("defs").append("marker").attr("id","crosshead").attr("markerWidth",15).attr("markerHeight",8).attr("orient","auto").attr("refX",16).attr("refY",4);a.append("path").attr("fill","black").attr("stroke","#000000").style("stroke-dasharray","0, 
0").attr("stroke-width","1px").attr("d","M 9,2 V 6 L16,4 Z"),a.append("path").attr("fill","none").attr("stroke","#000000").style("stroke-dasharray","0, 0").attr("stroke-width","1px").attr("d","M 0,1 L 6,7 M 6,1 L 0,7")},v0=(e,t)=>({fontFamily:e[t+"FontFamily"],fontSize:e[t+"FontSize"],fontWeight:e[t+"FontWeight"]}),W=function(){function e(l,i,s,r,n,h,f){const d=i.append("text").attr("x",s+n/2).attr("y",r+h/2+5).style("text-anchor","middle").text(l);o(d,f)}function t(l,i,s,r,n,h,f,d){const{fontSize:p,fontFamily:E,fontWeight:O}=d,R=l.split(Kt.lineBreakRegex);for(let S=0;S<R.length;S++){const L=S*p-p*(R.length-1)/2,Y=i.append("text").attr("x",s+n/2).attr("y",r).style("text-anchor","middle").attr("dominant-baseline","middle").style("font-size",p).style("font-weight",O).style("font-family",E);Y.append("tspan").attr("dy",L).text(R[S]).attr("alignment-baseline","mathematical"),o(Y,f)}}function a(l,i,s,r,n,h,f,d){const p=i.append("switch"),O=p.append("foreignObject").attr("x",s).attr("y",r).attr("width",n).attr("height",h).append("xhtml:div").style("display","table").style("height","100%").style("width","100%");O.append("div").style("display","table-cell").style("text-align","center").style("vertical-align","middle").text(l),t(l,p,s,r,n,h,f,d),o(O,f)}function o(l,i){for(const s in i)i.hasOwnProperty(s)&&l.attr(s,i[s])}return function(l){return l.textPlacement==="fo"?a:l.textPlacement==="old"?e:t}}(),F={drawRect:ie,drawBoundary:d0,drawC4Shape:f0,drawRels:u0,drawImage:ye,insertArrowHead:b0,insertArrowEnd:_0,insertArrowFilledHead:x0,insertDynamicNumber:m0,insertArrowCrossHead:E0,insertDatabaseIcon:p0,insertComputerIcon:y0,insertClockIcon:g0};let Ut=0,Ft=0,ge=4,Zt=2;Yt.yy=Jt;let b={};class be{constructor(t){this.name="",this.data={},this.data.startx=void 0,this.data.stopx=void 0,this.data.starty=void 0,this.data.stopy=void 0,this.data.widthLimit=void 0,this.nextData={},this.nextData.startx=void 0,this.nextData.stopx=void 0,this.nextData.starty=void 0,this.nextData.stopy=void 
0,this.nextData.cnt=0,$t(t.db.getConfig())}setData(t,a,o,l){this.nextData.startx=this.data.startx=t,this.nextData.stopx=this.data.stopx=a,this.nextData.starty=this.data.starty=o,this.nextData.stopy=this.data.stopy=l}updateVal(t,a,o,l){t[a]===void 0?t[a]=o:t[a]=l(o,t[a])}insert(t){this.nextData.cnt=this.nextData.cnt+1;let a=this.nextData.startx===this.nextData.stopx?this.nextData.stopx+t.margin:this.nextData.stopx+t.margin*2,o=a+t.width,l=this.nextData.starty+t.margin*2,i=l+t.height;(a>=this.data.widthLimit||o>=this.data.widthLimit||this.nextData.cnt>ge)&&(a=this.nextData.startx+t.margin+b.nextLinePaddingX,l=this.nextData.stopy+t.margin*2,this.nextData.stopx=o=a+t.width,this.nextData.starty=this.nextData.stopy,this.nextData.stopy=i=l+t.height,this.nextData.cnt=1),t.x=a,t.y=l,this.updateVal(this.data,"startx",a,Math.min),this.updateVal(this.data,"starty",l,Math.min),this.updateVal(this.data,"stopx",o,Math.max),this.updateVal(this.data,"stopy",i,Math.max),this.updateVal(this.nextData,"startx",a,Math.min),this.updateVal(this.nextData,"starty",l,Math.min),this.updateVal(this.nextData,"stopx",o,Math.max),this.updateVal(this.nextData,"stopy",i,Math.max)}init(t){this.name="",this.data={startx:void 0,stopx:void 0,starty:void 0,stopy:void 0,widthLimit:void 0},this.nextData={startx:void 0,stopx:void 0,starty:void 0,stopy:void 0,cnt:0},$t(t.db.getConfig())}bumpLastMargin(t){this.data.stopx+=t,this.data.stopy+=t}}const 
$t=function(e){Se(b,e),e.fontFamily&&(b.personFontFamily=b.systemFontFamily=b.messageFontFamily=e.fontFamily),e.fontSize&&(b.personFontSize=b.systemFontSize=b.messageFontSize=e.fontSize),e.fontWeight&&(b.personFontWeight=b.systemFontWeight=b.messageFontWeight=e.fontWeight)},Rt=(e,t)=>({fontFamily:e[t+"FontFamily"],fontSize:e[t+"FontSize"],fontWeight:e[t+"FontWeight"]}),Bt=e=>({fontFamily:e.boundaryFontFamily,fontSize:e.boundaryFontSize,fontWeight:e.boundaryFontWeight}),k0=e=>({fontFamily:e.messageFontFamily,fontSize:e.messageFontSize,fontWeight:e.messageFontWeight});function I(e,t,a,o,l){if(!t[e].width)if(a)t[e].text=Me(t[e].text,l,o),t[e].textLines=t[e].text.split(Kt.lineBreakRegex).length,t[e].width=l,t[e].height=oe(t[e].text,o);else{let i=t[e].text.split(Kt.lineBreakRegex);t[e].textLines=i.length;let s=0;t[e].height=0,t[e].width=0;for(const r of i)t[e].width=Math.max(wt(r,o),t[e].width),s=oe(r,o),t[e].height=t[e].height+s}}const _e=function(e,t,a){t.x=a.data.startx,t.y=a.data.starty,t.width=a.data.stopx-a.data.startx,t.height=a.data.stopy-a.data.starty,t.label.y=b.c4ShapeMargin-35;let o=t.wrap&&b.wrap,l=Bt(b);l.fontSize=l.fontSize+2,l.fontWeight="bold";let i=wt(t.label.text,l);I("label",t,o,l,i),F.drawBoundary(e,t,b)},xe=function(e,t,a,o){let l=0;for(const i of o){l=0;const s=a[i];let r=Rt(b,s.typeC4Shape.text);switch(r.fontSize=r.fontSize-2,s.typeC4Shape.width=wt("«"+s.typeC4Shape.text+"»",r),s.typeC4Shape.height=r.fontSize+2,s.typeC4Shape.Y=b.c4ShapePadding,l=s.typeC4Shape.Y+s.typeC4Shape.height-4,s.image={width:0,height:0,Y:0},s.typeC4Shape.text){case"person":case"external_person":s.image.width=48,s.image.height=48,s.image.Y=l,l=s.image.Y+s.image.height;break}s.sprite&&(s.image.width=48,s.image.height=48,s.image.Y=l,l=s.image.Y+s.image.height);let 
n=s.wrap&&b.wrap,h=b.width-b.c4ShapePadding*2,f=Rt(b,s.typeC4Shape.text);if(f.fontSize=f.fontSize+2,f.fontWeight="bold",I("label",s,n,f,h),s.label.Y=l+8,l=s.label.Y+s.label.height,s.type&&s.type.text!==""){s.type.text="["+s.type.text+"]";let E=Rt(b,s.typeC4Shape.text);I("type",s,n,E,h),s.type.Y=l+5,l=s.type.Y+s.type.height}else if(s.techn&&s.techn.text!==""){s.techn.text="["+s.techn.text+"]";let E=Rt(b,s.techn.text);I("techn",s,n,E,h),s.techn.Y=l+5,l=s.techn.Y+s.techn.height}let d=l,p=s.label.width;if(s.descr&&s.descr.text!==""){let E=Rt(b,s.typeC4Shape.text);I("descr",s,n,E,h),s.descr.Y=l+20,l=s.descr.Y+s.descr.height,p=Math.max(s.label.width,s.descr.width),d=l-s.descr.textLines*5}p=p+b.c4ShapePadding,s.width=Math.max(s.width||b.width,p,b.width),s.height=Math.max(s.height||b.height,d,b.height),s.margin=s.margin||b.c4ShapeMargin,e.insert(s),F.drawC4Shape(t,s,b)}e.bumpLastMargin(b.c4ShapeMargin)};class B{constructor(t,a){this.x=t,this.y=a}}let ce=function(e,t){let a=e.x,o=e.y,l=t.x,i=t.y,s=a+e.width/2,r=o+e.height/2,n=Math.abs(a-l),h=Math.abs(o-i),f=h/n,d=e.height/e.width,p=null;return o==i&&a<l?p=new B(a+e.width,r):o==i&&a>l?p=new B(a,r):a==l&&o<i?p=new B(s,o+e.height):a==l&&o>i&&(p=new B(s,o)),a>l&&o<i?d>=f?p=new B(a,r+f*e.width/2):p=new B(s-n/h*e.height/2,o+e.height):a<l&&o<i?d>=f?p=new B(a+e.width,r+f*e.width/2):p=new B(s+n/h*e.height/2,o+e.height):a<l&&o>i?d>=f?p=new B(a+e.width,r-f*e.width/2):p=new B(s+e.height/2*n/h,o):a>l&&o>i&&(d>=f?p=new B(a,r-e.width/2*f):p=new B(s-e.height/2*n/h,o)),p},A0=function(e,t){let a={x:0,y:0};a.x=t.x+t.width/2,a.y=t.y+t.height/2;let o=ce(e,a);a.x=e.x+e.width/2,a.y=e.y+e.height/2;let l=ce(t,a);return{startPoint:o,endPoint:l}};const C0=function(e,t,a,o){let l=0;for(let i of t){l=l+1;let s=i.wrap&&b.wrap,r=k0(b);o.db.getC4Type()==="C4Dynamic"&&(i.label.text=l+": "+i.label.text);let 
h=wt(i.label.text,r);I("label",i,s,r,h),i.techn&&i.techn.text!==""&&(h=wt(i.techn.text,r),I("techn",i,s,r,h)),i.descr&&i.descr.text!==""&&(h=wt(i.descr.text,r),I("descr",i,s,r,h));let f=a(i.from),d=a(i.to),p=A0(f,d);i.startPoint=p.startPoint,i.endPoint=p.endPoint}F.drawRels(e,t,b)};function me(e,t,a,o,l){let i=new be(l);i.data.widthLimit=a.data.widthLimit/Math.min(Zt,o.length);for(let[s,r]of o.entries()){let n=0;r.image={width:0,height:0,Y:0},r.sprite&&(r.image.width=48,r.image.height=48,r.image.Y=n,n=r.image.Y+r.image.height);let h=r.wrap&&b.wrap,f=Bt(b);if(f.fontSize=f.fontSize+2,f.fontWeight="bold",I("label",r,h,f,i.data.widthLimit),r.label.Y=n+8,n=r.label.Y+r.label.height,r.type&&r.type.text!==""){r.type.text="["+r.type.text+"]";let O=Bt(b);I("type",r,h,O,i.data.widthLimit),r.type.Y=n+5,n=r.type.Y+r.type.height}if(r.descr&&r.descr.text!==""){let O=Bt(b);O.fontSize=O.fontSize-2,I("descr",r,h,O,i.data.widthLimit),r.descr.Y=n+20,n=r.descr.Y+r.descr.height}if(s==0||s%Zt===0){let O=a.data.startx+b.diagramMarginX,R=a.data.stopy+b.diagramMarginY+n;i.setData(O,O,R,R)}else{let O=i.data.stopx!==i.data.startx?i.data.stopx+b.diagramMarginX:i.data.startx,R=i.data.starty;i.setData(O,O,R,R)}i.name=r.alias;let d=l.db.getC4ShapeArray(r.alias),p=l.db.getC4ShapeKeys(r.alias);p.length>0&&xe(i,e,d,p),t=r.alias;let E=l.db.getBoundarys(t);E.length>0&&me(e,t,i,E,l),r.alias!=="global"&&_e(e,r,i),a.data.stopy=Math.max(i.data.stopy+b.c4ShapeMargin,a.data.stopy),a.data.stopx=Math.max(i.data.stopx+b.c4ShapeMargin,a.data.stopx),Ut=Math.max(Ut,a.data.stopx),Ft=Math.max(Ft,a.data.stopy)}}const w0=function(e,t,a,o){b=Dt().c4;const l=Dt().securityLevel;let i;l==="sandbox"&&(i=Nt("#i"+t));const s=l==="sandbox"?Nt(i.nodes()[0].contentDocument.body):Nt("body");let r=o.db;o.db.setWrap(b.wrap),ge=r.getC4ShapeInRow(),Zt=r.getC4BoundaryInRow(),le.debug(`C:${JSON.stringify(b,null,2)}`);const 
n=l==="sandbox"?s.select(`[id="${t}"]`):Nt(`[id="${t}"]`);F.insertComputerIcon(n),F.insertDatabaseIcon(n),F.insertClockIcon(n);let h=new be(o);h.setData(b.diagramMarginX,b.diagramMarginX,b.diagramMarginY,b.diagramMarginY),h.data.widthLimit=screen.availWidth,Ut=b.diagramMarginX,Ft=b.diagramMarginY;const f=o.db.getTitle();let d=o.db.getBoundarys("");me(n,"",h,d,o),F.insertArrowHead(n),F.insertArrowEnd(n),F.insertArrowCrossHead(n),F.insertArrowFilledHead(n),C0(n,o.db.getRels(),o.db.getC4Shape,o),h.data.stopx=Ut,h.data.stopy=Ft;const p=h.data;let O=p.stopy-p.starty+2*b.diagramMarginY;const S=p.stopx-p.startx+2*b.diagramMarginX;f&&n.append("text").text(f).attr("x",(p.stopx-p.startx)/2-4*b.diagramMarginX).attr("y",p.starty+b.diagramMarginY),De(n,O,S,b.useMaxWidth);const L=f?60:0;n.attr("viewBox",p.startx-b.diagramMarginX+" -"+(b.diagramMarginY+L)+" "+S+" "+(O+L)),le.debug("models:",p)},he={drawPersonOrSystemArray:xe,drawBoundary:_e,setConf:$t,draw:w0},O0=e=>`.person {
|
| 7 |
+
stroke: ${e.personBorder};
|
| 8 |
+
fill: ${e.personBkg};
|
| 9 |
+
}
|
| 10 |
+
`,T0=O0,S0={parser:Be,db:Jt,renderer:he,styles:T0,init:({c4:e,wrap:t})=>{he.setConf(e),Jt.setWrap(t)}};export{S0 as diagram};
|
frontend-dist/assets/channel-DsKT-zfZ.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
import{aH as o,aI as n}from"./index-BCNM9-Ly.js";const t=(a,r)=>o.lang.round(n.parse(a)[r]);export{t as c};
|
frontend-dist/assets/classDiagram-beda092f-wmkRqnN2.js
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import{s as A,d as S,p as G}from"./styles-b4e223ce-CtHeUc7h.js";import{c as v,l as y,d as B,e as W,F as $,A as M,G as I}from"./index-BCNM9-Ly.js";import{G as O}from"./graph-CY8eBbAS.js";import{l as P}from"./layout-CUwpW5wl.js";import{l as X}from"./line-DdWeXrJe.js";import"./array-BKyUJesY.js";import"./path-CbwjOpE9.js";let H=0;const Y=function(i,a,t,o,p){const g=function(e){switch(e){case p.db.relationType.AGGREGATION:return"aggregation";case p.db.relationType.EXTENSION:return"extension";case p.db.relationType.COMPOSITION:return"composition";case p.db.relationType.DEPENDENCY:return"dependency";case p.db.relationType.LOLLIPOP:return"lollipop"}};a.points=a.points.filter(e=>!Number.isNaN(e.y));const s=a.points,c=X().x(function(e){return e.x}).y(function(e){return e.y}).curve($),n=i.append("path").attr("d",c(s)).attr("id","edge"+H).attr("class","relation");let r="";o.arrowMarkerAbsolute&&(r=window.location.protocol+"//"+window.location.host+window.location.pathname+window.location.search,r=r.replace(/\(/g,"\\("),r=r.replace(/\)/g,"\\)")),t.relation.lineType==1&&n.attr("class","relation dashed-line"),t.relation.lineType==10&&n.attr("class","relation dotted-line"),t.relation.type1!=="none"&&n.attr("marker-start","url("+r+"#"+g(t.relation.type1)+"Start)"),t.relation.type2!=="none"&&n.attr("marker-end","url("+r+"#"+g(t.relation.type2)+"End)");let f,h;const x=a.points.length;let b=M.calcLabelPosition(a.points);f=b.x,h=b.y;let u,m,w,k;if(x%2!==0&&x>1){let e=M.calcCardinalityPosition(t.relation.type1!=="none",a.points,a.points[0]),d=M.calcCardinalityPosition(t.relation.type2!=="none",a.points,a.points[x-1]);y.debug("cardinality_1_point "+JSON.stringify(e)),y.debug("cardinality_2_point "+JSON.stringify(d)),u=e.x,m=e.y,w=d.x,k=d.y}if(t.title!==void 0){const e=i.append("g").attr("class","classLabel"),d=e.append("text").attr("class","label").attr("x",f).attr("y",h).attr("fill","red").attr("text-anchor","middle").text(t.title);window.label=d;const 
l=d.node().getBBox();e.insert("rect",":first-child").attr("class","box").attr("x",l.x-o.padding/2).attr("y",l.y-o.padding/2).attr("width",l.width+o.padding).attr("height",l.height+o.padding)}y.info("Rendering relation "+JSON.stringify(t)),t.relationTitle1!==void 0&&t.relationTitle1!=="none"&&i.append("g").attr("class","cardinality").append("text").attr("class","type1").attr("x",u).attr("y",m).attr("fill","black").attr("font-size","6").text(t.relationTitle1),t.relationTitle2!==void 0&&t.relationTitle2!=="none"&&i.append("g").attr("class","cardinality").append("text").attr("class","type2").attr("x",w).attr("y",k).attr("fill","black").attr("font-size","6").text(t.relationTitle2),H++},J=function(i,a,t,o){y.debug("Rendering class ",a,t);const p=a.id,g={id:p,label:a.id,width:0,height:0},s=i.append("g").attr("id",o.db.lookUpDomId(p)).attr("class","classGroup");let c;a.link?c=s.append("svg:a").attr("xlink:href",a.link).attr("target",a.linkTarget).append("text").attr("y",t.textHeight+t.padding).attr("x",0):c=s.append("text").attr("y",t.textHeight+t.padding).attr("x",0);let n=!0;a.annotations.forEach(function(d){const l=c.append("tspan").text("«"+d+"»");n||l.attr("dy",t.textHeight),n=!1});let r=C(a);const f=c.append("tspan").text(r).attr("class","title");n||f.attr("dy",t.textHeight);const h=c.node().getBBox().height;let x,b,u;if(a.members.length>0){x=s.append("line").attr("x1",0).attr("y1",t.padding+h+t.dividerMargin/2).attr("y2",t.padding+h+t.dividerMargin/2);const d=s.append("text").attr("x",t.padding).attr("y",h+t.dividerMargin+t.textHeight).attr("fill","white").attr("class","classText");n=!0,a.members.forEach(function(l){_(d,l,n,t),n=!1}),b=d.node().getBBox()}if(a.methods.length>0){u=s.append("line").attr("x1",0).attr("y1",t.padding+h+t.dividerMargin+b.height).attr("y2",t.padding+h+t.dividerMargin+b.height);const 
d=s.append("text").attr("x",t.padding).attr("y",h+2*t.dividerMargin+b.height+t.textHeight).attr("fill","white").attr("class","classText");n=!0,a.methods.forEach(function(l){_(d,l,n,t),n=!1})}const m=s.node().getBBox();var w=" ";a.cssClasses.length>0&&(w=w+a.cssClasses.join(" "));const e=s.insert("rect",":first-child").attr("x",0).attr("y",0).attr("width",m.width+2*t.padding).attr("height",m.height+t.padding+.5*t.dividerMargin).attr("class",w).node().getBBox().width;return c.node().childNodes.forEach(function(d){d.setAttribute("x",(e-d.getBBox().width)/2)}),a.tooltip&&c.insert("title").text(a.tooltip),x&&x.attr("x2",e),u&&u.attr("x2",e),g.width=e,g.height=m.height+t.padding+.5*t.dividerMargin,g},C=function(i){let a=i.id;return i.type&&(a+="<"+I(i.type)+">"),a},Z=function(i,a,t,o){y.debug("Rendering note ",a,t);const p=a.id,g={id:p,text:a.text,width:0,height:0},s=i.append("g").attr("id",p).attr("class","classGroup");let c=s.append("text").attr("y",t.textHeight+t.padding).attr("x",0);const n=JSON.parse(`"${a.text}"`).split(`
|
| 2 |
+
`);n.forEach(function(x){y.debug(`Adding line: ${x}`),c.append("tspan").text(x).attr("class","title").attr("dy",t.textHeight)});const r=s.node().getBBox(),h=s.insert("rect",":first-child").attr("x",0).attr("y",0).attr("width",r.width+2*t.padding).attr("height",r.height+n.length*t.textHeight+t.padding+.5*t.dividerMargin).node().getBBox().width;return c.node().childNodes.forEach(function(x){x.setAttribute("x",(h-x.getBBox().width)/2)}),g.width=h,g.height=r.height+n.length*t.textHeight+t.padding+.5*t.dividerMargin,g},_=function(i,a,t,o){const{displayText:p,cssStyle:g}=a.getDisplayDetails(),s=i.append("tspan").attr("x",o.padding).text(p);g!==""&&s.attr("style",a.cssStyle),t||s.attr("dy",o.textHeight)},N={getClassTitleString:C,drawClass:J,drawEdge:Y,drawNote:Z};let T={};const E=20,L=function(i){const a=Object.entries(T).find(t=>t[1].label===i);if(a)return a[0]},R=function(i){i.append("defs").append("marker").attr("id","extensionStart").attr("class","extension").attr("refX",0).attr("refY",7).attr("markerWidth",190).attr("markerHeight",240).attr("orient","auto").append("path").attr("d","M 1,7 L18,13 V 1 Z"),i.append("defs").append("marker").attr("id","extensionEnd").attr("refX",19).attr("refY",7).attr("markerWidth",20).attr("markerHeight",28).attr("orient","auto").append("path").attr("d","M 1,1 V 13 L18,7 Z"),i.append("defs").append("marker").attr("id","compositionStart").attr("class","extension").attr("refX",0).attr("refY",7).attr("markerWidth",190).attr("markerHeight",240).attr("orient","auto").append("path").attr("d","M 18,7 L9,13 L1,7 L9,1 Z"),i.append("defs").append("marker").attr("id","compositionEnd").attr("refX",19).attr("refY",7).attr("markerWidth",20).attr("markerHeight",28).attr("orient","auto").append("path").attr("d","M 18,7 L9,13 L1,7 L9,1 Z"),i.append("defs").append("marker").attr("id","aggregationStart").attr("class","extension").attr("refX",0).attr("refY",7).attr("markerWidth",190).attr("markerHeight",240).attr("orient","auto").append("path").attr("d","M 
18,7 L9,13 L1,7 L9,1 Z"),i.append("defs").append("marker").attr("id","aggregationEnd").attr("refX",19).attr("refY",7).attr("markerWidth",20).attr("markerHeight",28).attr("orient","auto").append("path").attr("d","M 18,7 L9,13 L1,7 L9,1 Z"),i.append("defs").append("marker").attr("id","dependencyStart").attr("class","extension").attr("refX",0).attr("refY",7).attr("markerWidth",190).attr("markerHeight",240).attr("orient","auto").append("path").attr("d","M 5,7 L9,13 L1,7 L9,1 Z"),i.append("defs").append("marker").attr("id","dependencyEnd").attr("refX",19).attr("refY",7).attr("markerWidth",20).attr("markerHeight",28).attr("orient","auto").append("path").attr("d","M 18,7 L9,13 L14,7 L9,1 Z")},F=function(i,a,t,o){const p=v().class;T={},y.info("Rendering diagram "+i);const g=v().securityLevel;let s;g==="sandbox"&&(s=B("#i"+a));const c=g==="sandbox"?B(s.nodes()[0].contentDocument.body):B("body"),n=c.select(`[id='${a}']`);R(n);const r=new O({multigraph:!0});r.setGraph({isMultiGraph:!0}),r.setDefaultEdgeLabel(function(){return{}});const f=o.db.getClasses(),h=Object.keys(f);for(const e of h){const d=f[e],l=N.drawClass(n,d,p,o);T[l.id]=l,r.setNode(l.id,l),y.info("Org height: "+l.height)}o.db.getRelations().forEach(function(e){y.info("tjoho"+L(e.id1)+L(e.id2)+JSON.stringify(e)),r.setEdge(L(e.id1),L(e.id2),{relation:e},e.title||"DEFAULT")}),o.db.getNotes().forEach(function(e){y.debug(`Adding note: ${JSON.stringify(e)}`);const d=N.drawNote(n,e,p,o);T[d.id]=d,r.setNode(d.id,d),e.class&&e.class in f&&r.setEdge(e.id,L(e.class),{relation:{id1:e.id,id2:e.class,relation:{type1:"none",type2:"none",lineType:10}}},"DEFAULT")}),P(r),r.nodes().forEach(function(e){e!==void 0&&r.node(e)!==void 0&&(y.debug("Node "+e+": "+JSON.stringify(r.node(e))),c.select("#"+(o.db.lookUpDomId(e)||e)).attr("transform","translate("+(r.node(e).x-r.node(e).width/2)+","+(r.node(e).y-r.node(e).height/2)+" )"))}),r.edges().forEach(function(e){e!==void 0&&r.edge(e)!==void 0&&(y.debug("Edge "+e.v+" -> "+e.w+": 
"+JSON.stringify(r.edge(e))),N.drawEdge(n,r.edge(e),r.edge(e).relation,p,o))});const u=n.node().getBBox(),m=u.width+E*2,w=u.height+E*2;W(n,w,m,p.useMaxWidth);const k=`${u.x-E} ${u.y-E} ${m} ${w}`;y.debug(`viewBox ${k}`),n.attr("viewBox",k)},U={draw:F},tt={parser:G,db:S,renderer:U,styles:A,init:i=>{i.class||(i.class={}),i.class.arrowMarkerAbsolute=i.arrowMarkerAbsolute,S.clear()}};export{tt as diagram};
|