GitHub Actions Bot committed on
Commit
1ea875f
·
0 Parent(s):

deploy: auto-inject hf config & sync

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .env.example +93 -0
  2. .github/workflows/sync_to_hub.yml +58 -0
  3. .gitignore +43 -0
  4. Dockerfile +45 -0
  5. LICENSE +21 -0
  6. README.md +224 -0
  7. README_zh.md +212 -0
  8. app/core/config.py +246 -0
  9. app/main.py +560 -0
  10. app/services/agent_service.py +779 -0
  11. app/services/auto_evaluation_service.py +481 -0
  12. app/services/chat_service.py +601 -0
  13. app/services/chunking_service.py +372 -0
  14. app/services/github_service.py +210 -0
  15. app/services/tracing_service.py +549 -0
  16. app/services/vector_service.py +676 -0
  17. app/storage/__init__.py +34 -0
  18. app/storage/base.py +159 -0
  19. app/storage/qdrant_store.py +578 -0
  20. app/utils/embedding.py +254 -0
  21. app/utils/github_client.py +478 -0
  22. app/utils/llm_client.py +108 -0
  23. app/utils/llm_providers/__init__.py +29 -0
  24. app/utils/llm_providers/anthropic_provider.py +196 -0
  25. app/utils/llm_providers/base.py +320 -0
  26. app/utils/llm_providers/deepseek_provider.py +154 -0
  27. app/utils/llm_providers/factory.py +171 -0
  28. app/utils/llm_providers/gemini_provider.py +301 -0
  29. app/utils/llm_providers/openai_provider.py +145 -0
  30. app/utils/repo_lock.py +390 -0
  31. app/utils/retry.py +198 -0
  32. app/utils/session.py +230 -0
  33. deploy.sh +143 -0
  34. docker-compose.yml +102 -0
  35. evaluation/__init__.py +64 -0
  36. evaluation/analyze_eval_results.py +379 -0
  37. evaluation/clean_and_export_sft_data.py +369 -0
  38. evaluation/data_router.py +222 -0
  39. evaluation/evaluation_framework.py +512 -0
  40. evaluation/golden_dataset_builder.py +414 -0
  41. evaluation/models.py +244 -0
  42. evaluation/test_retrieval.py +330 -0
  43. evaluation/utils.py +196 -0
  44. frontend-dist/assets/Tableau10-B-NsZVaP.js +1 -0
  45. frontend-dist/assets/arc-BscbqCCW.js +1 -0
  46. frontend-dist/assets/array-BKyUJesY.js +1 -0
  47. frontend-dist/assets/blockDiagram-c4efeb88-CL85BYG9.js +118 -0
  48. frontend-dist/assets/c4Diagram-c83219d4-Dwk4T9_E.js +10 -0
  49. frontend-dist/assets/channel-DsKT-zfZ.js +1 -0
  50. frontend-dist/assets/classDiagram-beda092f-wmkRqnN2.js +2 -0
.env.example ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ======================================
2
+ # GitHub Agent Demo - 环境变量配置
3
+ # ======================================
4
+
5
+ # --- LLM 供应商选择 ---
6
+ # 支持: openai, deepseek, anthropic, gemini
7
+ # 默认: deepseek
8
+ LLM_PROVIDER=deepseek
9
+
10
+ # --- API Keys (根据选择的供应商配置对应的 Key) ---
11
+
12
+ # OpenAI (如果 LLM_PROVIDER=openai)
13
+ OPENAI_API_KEY=
14
+ # OPENAI_BASE_URL= # 可选: 自定义端点 (如 Azure OpenAI)
15
+
16
+ # DeepSeek (如果 LLM_PROVIDER=deepseek)
17
+ DEEPSEEK_API_KEY=
18
+ # DEEPSEEK_BASE_URL=https://api.deepseek.com # 可选: 默认值
19
+
20
+ # Anthropic Claude (如果 LLM_PROVIDER=anthropic)
21
+ ANTHROPIC_API_KEY=
22
+
23
+ # Google Gemini (如果 LLM_PROVIDER=gemini)
24
+ GEMINI_API_KEY=
25
+ # GEMINI_BASE_URL= # 可选: OpenAI 兼容端点
26
+
27
+ # --- 模型配置 ---
28
+ # 如果不指定,将使用各供应商的默认模型:
29
+ # - openai: gpt-4o-mini
30
+ # - deepseek: deepseek-chat
31
+ # - anthropic: claude-3-5-sonnet-20241022
32
+ # - gemini: gemini-1.5-flash
33
+ # MODEL_NAME=deepseek-chat
34
+
35
+ # --- GitHub Token ---
36
+ # 用于访问 GitHub API,提高请求限制
37
+ GITHUB_TOKEN=
38
+
39
+ # --- Embedding 服务 ---
40
+ # SiliconFlow API Key (用于 BGE-M3 Embedding)
41
+ SILICON_API_KEY=
42
+
43
+ # --- Langfuse 追踪配置 (可选) ---
44
+ # LANGFUSE_ENABLED=true
45
+ # LANGFUSE_HOST=http://localhost:3000
46
+ # LANGFUSE_PUBLIC_KEY=
47
+ # LANGFUSE_SECRET_KEY=
48
+
49
+ # --- Qdrant 向量数据库配置 ---
50
+ # 模式选择: "local" | "server" | "cloud"
51
+ # - local: 本地嵌入式存储 (开发环境, 单 Worker)
52
+ # - server: Qdrant Server Docker (生产环境, 多 Worker)
53
+ # - cloud: Qdrant Cloud 托管服务
54
+ QDRANT_MODE=local
55
+ QDRANT_LOCAL_PATH=data/qdrant_db
56
+
57
+ # Server 模式: 连接 Qdrant Server (Docker)
58
+ # QDRANT_MODE=server
59
+ # QDRANT_URL=http://localhost:6333
60
+ # 或分开配置:
61
+ # QDRANT_HOST=localhost
62
+ # QDRANT_PORT=6333
63
+
64
+ # Cloud 模式: 连接 Qdrant Cloud
65
+ # QDRANT_MODE=cloud
66
+ # QDRANT_URL=https://xxx.qdrant.tech
67
+ # QDRANT_API_KEY=your-api-key
68
+
69
+ # 向量维度 (BGE-M3 = 1024)
70
+ # QDRANT_VECTOR_SIZE=1024
71
+
72
+ # --- Gunicorn Worker 配置 ---
73
+ # 2核2G服务器建议设为 2
74
+ # 4核8G服务器可设为 4
75
+ GUNICORN_WORKERS=2
76
+
77
+ # --- 分布式锁配置 ---
78
+ # 锁后端: "memory" | "file" | "redis"
79
+ # - memory: 内存锁 (单进程)
80
+ # - file: 文件锁 (多 Worker 单节点)
81
+ # - redis: Redis 分布式锁 (多节点)
82
+ LOCK_BACKEND=file
83
+ LOCK_DIR=data/locks
84
+ # REDIS_URL=redis://localhost:6379/0
85
+
86
+ # --- 服务配置 ---
87
+ HOST=0.0.0.0
88
+ PORT=8000
89
+
90
+ # --- LLM 参数 (可选) ---
91
+ # LLM_TEMPERATURE=0.1
92
+ # LLM_MAX_TOKENS=4096
93
+ # LLM_TIMEOUT=600
.github/workflows/sync_to_hub.yml ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync to Hugging Face hub
2
+ on:
3
+ push:
4
+ branches: [main]
5
+ workflow_dispatch:
6
+
7
+ jobs:
8
+ sync-to-hub:
9
+ runs-on: ubuntu-latest
10
+ steps:
11
+ - uses: actions/checkout@v3
12
+ with:
13
+ fetch-depth: 0
14
+
15
+ - name: Push to hub
16
+ env:
17
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
18
+ HF_USERNAME: realdexter
19
+ SPACE_NAME: RepoReaper
20
+ run: |
21
+ echo "🚀 Starting deployment to Hugging Face..."
22
+
23
+ # 1. 配置 Git
24
+ git config --global user.email "bot@github.com"
25
+ git config --global user.name "GitHub Actions Bot"
26
+
27
+ # 2. 【核心魔法】动态生成 Hugging Face 专用的 README
28
+ # 这一步会在发送给 HF 之前,强行在 README.md 顶部插入配置头
29
+ # GitHub 本地的文件不会受影响,依然保持干净漂亮
30
+ echo "---" > hf_header.yml
31
+ echo "title: RepoReaper" >> hf_header.yml
32
+ echo "emoji: 💀" >> hf_header.yml
33
+ echo "colorFrom: blue" >> hf_header.yml
34
+ echo "colorTo: indigo" >> hf_header.yml
35
+ echo "sdk: docker" >> hf_header.yml
36
+ echo "pinned: false" >> hf_header.yml
37
+ echo "app_port: 8000" >> hf_header.yml # 👈 关键:这里指定端口,你就不用改代码了
38
+ echo "---" >> hf_header.yml
39
+ echo "" >> hf_header.yml
40
+
41
+ # 将配置头和原 README 内容拼接
42
+ cat hf_header.yml README.md > README_temp.md
43
+ mv README_temp.md README.md
44
+
45
+ # 3. 清理不需要的文件
46
+ rm -rf docs/
47
+ rm -f *.jpg *.png *.gif hf_header.yml
48
+ rm -rf .git
49
+
50
+ # 4. 初始化新仓库并推送
51
+ git init -b main
52
+ git add .
53
+ git commit -m "deploy: auto-inject hf config & sync"
54
+
55
+ git remote add space https://$HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/$HF_USERNAME/$SPACE_NAME
56
+ git push --force space main
57
+
58
+ echo "✅ Deployment successful! Config header injected on-the-fly."
.gitignore ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # .gitignore
2
+ __pycache__/
3
+ *.py[cod]
4
+ .env
5
+ .venv/
6
+ venv/
7
+ .DS_Store
8
+ data/
9
+ # Vue 构建输出
10
+ #frontend-dist/
11
+ frontend-vue/node_modules/
12
+ frontend-vue/dist/
13
+
14
+ # 锁文件目录
15
+ data/locks/
16
+
17
+ # 日志
18
+ logs/
19
+ *.log
20
+
21
+ # IDE
22
+ .idea/
23
+ .vscode/
24
+ *.swp
25
+
26
+ # 临时文件
27
+ *.tmp
28
+ *.bak
29
+ QUICKSTART.md
30
+ docs/INTERVIEW_QA.md
31
+ docs/ROADMAP.md
32
+ docs/TECHNICAL_REPORT.md
33
+ evaluation/000_START_HERE.md
34
+ evaluation/golden_dataset.json
35
+ evaluation/HIGH_QUALITY_QUESTIONS.md
36
+
37
+ evaluation/README_EVALUATION_SYSTEM.md
38
+ evaluation/ragas_eval_dataset.json
39
+ evaluation/sft_data/eval_results.jsonl
40
+ evaluation/sft_data/negative_samples.jsonl
41
+ evaluation/sft_data/positive_samples.jsonl
42
+ evaluation/sft_data/skipped_samples.jsonl
43
+ evaluation/sft_data/cleaned/rejected_20260128_010745.jsonl
Dockerfile ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 1. 基础镜像:选择 Python 3.10 的轻量版 (Slim)
2
+ FROM python:3.10-slim
3
+
4
+ # 2. 设置环境变量
5
+ ENV PYTHONDONTWRITEBYTECODE=1 \
6
+ PYTHONUNBUFFERED=1 \
7
+ # 默认 LLM 供应商 (可通过 docker run -e 覆盖)
8
+ LLM_PROVIDER=deepseek
9
+
10
+ # 3. 设置工作目录
11
+ WORKDIR /app
12
+
13
+ # 4. 安装系统级依赖
14
+ # build-essential: ChromaDB 编译需要
15
+ # curl: 健康检查
16
+ # git: 某些 pip 包可能需要
17
+ RUN apt-get update && apt-get install -y --no-install-recommends \
18
+ build-essential \
19
+ curl \
20
+ git \
21
+ && rm -rf /var/lib/apt/lists/* \
22
+ && apt-get clean
23
+
24
+ # 5. 复制依赖文件并安装 (利用 Docker 层缓存)
25
+ COPY requirements.txt .
26
+
27
+ # 6. 安装 Python 依赖
28
+ RUN pip install --no-cache-dir --upgrade pip && \
29
+ pip install --no-cache-dir -r requirements.txt
30
+
31
+ # 7. 复制项目代码
32
+ COPY . .
33
+
34
+ # 8. 创建数据目录 (Qdrant 本地存储 + 上下文缓存)
35
+ RUN mkdir -p /app/data/qdrant_db /app/data/contexts
36
+
37
+ # 9. 暴露端口
38
+ EXPOSE 8000
39
+
40
+ # 10. 健康检查
41
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
42
+ CMD curl -f http://localhost:8000/health || exit 1
43
+
44
+ # 11. 启动命令
45
+ CMD ["gunicorn", "-c", "gunicorn_conf.py", "app.main:app"]
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 tzzp1224
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: RepoReaper
3
+ emoji: 💀
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: docker
7
+ pinned: false
8
+ app_port: 8000
9
+ ---
10
+
11
+ <div align="center">
12
+
13
+ <img src="./docs/logo.jpg" width="800" style="max-width: 100%;" height="auto" alt="RepoReaper Logo">
14
+
15
+ <h1>RepoReaper</h1>
16
+
17
+ <h3>💀 Harvest Logic. Dissect Architecture. Chat with Code.</h3>
18
+
19
+ <p>
20
+ <a href="./README.md">English</a> •
21
+ <a href="./README_zh.md">简体中文</a>
22
+ </p>
23
+
24
+ <a href="./LICENSE">
25
+ <img src="https://img.shields.io/github/license/tzzp1224/RepoReaper?style=flat-square&color=blue" alt="License">
26
+ </a>
27
+ <img src="https://img.shields.io/badge/Python-3.10+-3776AB?style=flat-square&logo=python&logoColor=white" alt="Python Version">
28
+ <img src="https://img.shields.io/badge/Model-DeepSeek_V3-673AB7?style=flat-square&logo=openai&logoColor=white" alt="DeepSeek Powered">
29
+ <img src="https://img.shields.io/badge/Agent-ReAct-orange?style=flat-square" alt="Agent Architecture">
30
+
31
+ <br>
32
+
33
+ <img src="https://img.shields.io/badge/RAG-Hybrid_Search-009688?style=flat-square" alt="RAG">
34
+ <img src="https://img.shields.io/badge/VectorDB-Qdrant-important?style=flat-square" alt="Qdrant">
35
+ <img src="https://img.shields.io/badge/Framework-FastAPI-005571?style=flat-square&logo=fastapi&logoColor=white" alt="FastAPI">
36
+ <img src="https://img.shields.io/badge/Frontend-Vue_3-4FC08D?style=flat-square&logo=vue.js&logoColor=white" alt="Vue 3">
37
+ <img src="https://img.shields.io/badge/Docker-Ready-2496ED?style=flat-square&logo=docker&logoColor=white" alt="Docker">
38
+
39
+ <br>
40
+ <br>
41
+
42
+ <p>
43
+ <b>👇 Live Demo / 在线体验 👇</b>
44
+ </p>
45
+ <p align="center">
46
+ <a href="https://realdexter-reporeaper.hf.space" target="_blank" rel="noopener noreferrer">
47
+ <img src="https://img.shields.io/badge/🤗%20Hugging%20Face-Global%20Demo-ffd21e?style=for-the-badge&logo=huggingface&logoColor=black" alt="Global Demo" height="45">
48
+ </a>
49
+ &nbsp;&nbsp;&nbsp;
50
+ <a href="https://repo.realdexter.com/" target="_blank" rel="noopener noreferrer">
51
+ <img src="https://img.shields.io/badge/🚀%20Seoul%20Server-CN%20Optimized-red?style=for-the-badge&logo=rocket&logoColor=white" alt="China Demo" height="45">
52
+ </a>
53
+ </p>
54
+
55
+ <p align="center">
56
+ <small>
57
+ ⚠️ Public demos use shared API quotas. Deploy locally for the best experience.
58
+ </small>
59
+ </p>
60
+
61
+ <br>
62
+
63
+ <img src="./docs/demo_preview.gif" width="800" style="max-width: 100%; box-shadow: 0 4px 8px rgba(0,0,0,0.1); border-radius: 8px;" alt="RepoReaper Demo">
64
+
65
+ <br>
66
+ </div>
67
+
68
+ ---
69
+
70
+ An autonomous Agent that dissects any GitHub repository. It maps code architecture, warms up semantic cache, and answers questions with Just-In-Time context retrieval.
71
+
72
+ ---
73
+
74
+ ## ✨ Key Features
75
+
76
+ | Feature | Description |
77
+ |:--------|:------------|
78
+ | **Multi-Language AST Parsing** | Python AST + Regex patterns for Java, TypeScript, Go, Rust, etc. |
79
+ | **Hybrid Search** | Qdrant vectors + BM25 with RRF fusion |
80
+ | **JIT Context Loading** | Auto-fetches missing files during Q&A |
81
+ | **Query Rewrite** | Translates natural language to code keywords |
82
+ | **End-to-End Tracing** | Langfuse integration for observability |
83
+ | **Auto Evaluation** | LLM-as-Judge scoring pipeline |
84
+
85
+ ---
86
+
87
+ ## 🏗 Architecture
88
+
89
+ ```
90
+ ┌─────────────────────────────────────────────────────────────┐
91
+ │ Vue 3 Frontend (SSE Streaming + Mermaid Diagrams) │
92
+ └─────────────────────┬───────────────────────────────────────┘
93
+
94
+ ┌─────────────────────▼───────────────────────────────────────┐
95
+ │ FastAPI Backend │
96
+ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────┐ │
97
+ │ │ Agent │ │ Chat │ │ Evaluation │ │
98
+ │ │ Service │ │ Service │ │ Framework │ │
99
+ │ └──────┬──────┘ └──────┬──────┘ └─────────────────────┘ │
100
+ │ │ │ │
101
+ │ ┌──────▼───────────────▼──────┐ ┌─────────────────────┐ │
102
+ │ │ Vector Service (Qdrant+BM25)│ │ Tracing (Langfuse) │ │
103
+ │ └─────────────────────────────┘ └─────────────────────┘ │
104
+ └─────────────────────────────────────────────────────────────┘
105
+ ```
106
+
107
+ ---
108
+
109
+ ## 🛠 Tech Stack
110
+
111
+ **Backend:** Python 3.10+ · FastAPI · AsyncIO · Qdrant · BM25
112
+ **Frontend:** Vue 3 · Pinia · Mermaid.js · SSE
113
+ **LLM:** DeepSeek V3 · SiliconFlow BGE-M3
114
+ **Ops:** Docker · Gunicorn · Langfuse
115
+
116
+ ---
117
+
118
+ ## 🏁 Quick Start
119
+
120
+ **Prerequisites:** Python 3.10+ · (Optional) Node 18+ for rebuilding frontend · GitHub Token (recommended) · LLM API Key (required)
121
+
122
+ ```bash
123
+ # Clone & Setup
124
+ git clone https://github.com/tzzp1224/RepoReaper.git && cd RepoReaper
125
+ python -m venv venv && source venv/bin/activate
126
+ pip install -r requirements.txt
127
+
128
+ # Configure .env (copy from example and fill in your keys)
129
+ cp .env.example .env
130
+ # Required: set LLM_PROVIDER and the matching *_API_KEY
131
+ # Recommended: GITHUB_TOKEN and SILICON_API_KEY (embeddings)
132
+
133
+ # (Optional) Build frontend (repo already contains frontend-dist)
134
+ cd frontend-vue
135
+ npm install
136
+ npm run build
137
+ cd ..
138
+
139
+ # Run
140
+ python -m app.main
141
+ ```
142
+
143
+ Open `http://localhost:8000` and paste any GitHub repo URL.
144
+
145
+ **Docker (single container, local Qdrant):**
146
+ ```bash
147
+ cp .env.example .env
148
+ docker build -t reporeaper .
149
+ docker run -d -p 8000:8000 --env-file .env reporeaper
150
+ ```
151
+
152
+ **Docker Compose (recommended, with Qdrant Server):**
153
+ ```bash
154
+ cp .env.example .env
155
+ # Set QDRANT_MODE=server and QDRANT_URL=http://qdrant:6333 in .env
156
+ docker compose up -d --build
157
+ ```
158
+
159
+
160
+
161
+
162
+
163
+ ## 📊 Evaluation & Tracing Status
164
+
165
+ | Component | Status | Notes |
166
+ |:----------|:------:|:------|
167
+ | **Self-built Eval Engine** | ✅ Working | 4-layer metrics (QueryRewrite / Retrieval / Generation / Agentic), LLM-as-Judge |
168
+ | **Auto Evaluation** | ✅ Working | Triggers after every `/chat`, async, writes to `evaluation/sft_data/` |
169
+ | **Data Routing (SFT)** | ✅ Working | Auto-grades Gold/Silver/Bronze/Rejected → JSONL files |
170
+ | **Eval API Endpoints** | ✅ Working | `/evaluate`, `/evaluation/stats`, `/dashboard/*`, `/auto-eval/*` (7 endpoints) |
171
+ | **Offline Retrieval Eval** | ✅ Working | `test_retrieval.py` — Hit Rate, Recall@K, Precision@K, MRR |
172
+ | **Langfuse Tracing** | ⚠️ Partial | Framework + 14 call sites wired in agent/chat services; falls back to local JSON logs (`logs/traces/`) when Langfuse unavailable |
173
+ | **Ragas Integration** | ❌ Placeholder | `use_ragas=False` by default; `_ragas_eval()` API call doesn't match latest Ragas SDK |
174
+ | **Langfuse ↔ Eval** | ❌ Not connected | Eval results only write JSONL, not reported to Langfuse Scores API |
175
+
176
+ > **Overall completion: ~65%** — the self-built eval loop is production-ready; Ragas and Langfuse integrations are scaffolded but not functional.
177
+
178
+ ---
179
+
180
+ ## ⚠️ Known Issues
181
+
182
+ 1. **Python 3.14 + Langfuse import error**
183
+ `pydantic.V1.errors.ConfigError: unable to infer type for attribute "description"` — Langfuse 3.x internally uses the `pydantic.v1` compat layer, which breaks on Python 3.14.
184
+ **Workaround:** set `LANGFUSE_ENABLED=false` in `.env`, or use Python 3.10–3.12.
185
+
186
+ 2. **Langfuse Server not included in `docker-compose.yml`**
187
+ Even if the import works, you need a running Langfuse instance. Add it yourself or use [app.langfuse.com](https://app.langfuse.com).
188
+
189
+ 3. **Trace spans are not linked**
190
+ `tracing_service` records spans/events but doesn't pass `trace_id` to Langfuse API calls — the Langfuse UI will show isolated events instead of a connected trace tree.
191
+
192
+ 4. **Ragas `_ragas_eval()` uses outdated API**
193
+ Passes a plain dict to `ragas.evaluate()`, but latest Ragas requires a `Dataset` object. The `ragas_eval_dataset.json` export exists but no script consumes it.
194
+
195
+ 5. **Golden dataset has no reference answers**
196
+ All 26 test cases have `expected_answer: ""` — generation quality cannot be compared against ground truth.
197
+
198
+ 6. **Heuristic fallback is coarse**
199
+ When no LLM client is available, `faithfulness` uses keyword overlap + 0.2 baseline; `completeness` is purely length-based.
200
+
201
+ ---
202
+
203
+ ## 🗺 Roadmap
204
+
205
+ - [ ] **Fix Langfuse compat** — pin `langfuse`/`pydantic` versions or gate import behind Python version check
206
+ - [ ] **Add Langfuse to `docker-compose.yml`** — one-command local observability
207
+ - [ ] **Wire trace_id through spans** — enable full trace tree in Langfuse UI
208
+ - [ ] **Integrate Ragas properly** — update `_ragas_eval()` to use `ragas.evaluate(Dataset(...))`, add a standalone eval script
209
+ - [ ] **Enrich golden dataset** — add `expected_answer` for generation benchmarking, expand to 50+ cases
210
+ - [ ] **Eval dashboard frontend** — Vue component to visualize quality distribution and bad cases
211
+ - [ ] **CI regression baseline** — run `test_retrieval.py` in GitHub Actions, fail on metric regression
212
+ - [ ] **Export to Langfuse Datasets** — push eval results to Langfuse Scores/Datasets API for unified observability
213
+
214
+ ---
215
+
216
+ ## 📈 Star History
217
+
218
+ <a href="https://star-history.com/#tzzp1224/RepoReaper&Date">
219
+ <picture>
220
+ <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=tzzp1224/RepoReaper&type=Date&theme=dark" />
221
+ <source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=tzzp1224/RepoReaper&type=Date" />
222
+ <img alt="Star History Chart" src="https://api.star-history.com/svg?repos=tzzp1224/RepoReaper&type=Date" />
223
+ </picture>
224
+ </a>
README_zh.md ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div align="center">
2
+
3
+ <img src="./docs/logo.jpg" width="800" style="max-width: 100%;" height="auto" alt="RepoReaper Logo">
4
+
5
+ <h1>RepoReaper</h1>
6
+
7
+ <h3>💀 Harvest Logic. Dissect Architecture. Chat with Code.</h3>
8
+
9
+ <p>
10
+ <a href="./README.md">English</a> •
11
+ <strong>简体中文</strong>
12
+ </p>
13
+
14
+ <a href="./LICENSE">
15
+ <img src="https://img.shields.io/github/license/tzzp1224/RepoReaper?style=flat-square&color=blue" alt="License">
16
+ </a>
17
+ <img src="https://img.shields.io/badge/Python-3.10+-3776AB?style=flat-square&logo=python&logoColor=white" alt="Python Version">
18
+ <img src="https://img.shields.io/badge/Model-DeepSeek_V3-673AB7?style=flat-square&logo=openai&logoColor=white" alt="DeepSeek Powered">
19
+ <img src="https://img.shields.io/badge/Agent-ReAct-orange?style=flat-square" alt="Agent Architecture">
20
+
21
+ <br>
22
+
23
+ <img src="https://img.shields.io/badge/RAG-Hybrid_Search-009688?style=flat-square" alt="RAG">
24
+ <img src="https://img.shields.io/badge/VectorDB-Qdrant-important?style=flat-square" alt="Qdrant">
25
+ <img src="https://img.shields.io/badge/Framework-FastAPI-005571?style=flat-square&logo=fastapi&logoColor=white" alt="FastAPI">
26
+ <img src="https://img.shields.io/badge/Frontend-Vue_3-4FC08D?style=flat-square&logo=vue.js&logoColor=white" alt="Vue 3">
27
+ <img src="https://img.shields.io/badge/Docker-Ready-2496ED?style=flat-square&logo=docker&logoColor=white" alt="Docker">
28
+
29
+ <br>
30
+ <br>
31
+
32
+ <p>
33
+ <b>👇 在线体验 👇</b>
34
+ </p>
35
+ <p align="center">
36
+ <a href="https://realdexter-reporeaper.hf.space" target="_blank" rel="noopener noreferrer">
37
+ <img src="https://img.shields.io/badge/🤗%20Hugging%20Face-Global%20Demo-ffd21e?style=for-the-badge&logo=huggingface&logoColor=black" alt="Global Demo" height="45">
38
+ </a>
39
+ &nbsp;&nbsp;&nbsp;
40
+ <a href="https://repo.realdexter.com/" target="_blank" rel="noopener noreferrer">
41
+ <img src="https://img.shields.io/badge/🚀%20Seoul%20Server-国内优化-red?style=for-the-badge&logo=rocket&logoColor=white" alt="China Demo" height="45">
42
+ </a>
43
+ </p>
44
+
45
+ <p align="center">
46
+ <small>
47
+ ⚠️ 中国用户请使用 Seoul Server。如遇限流,建议本地部署。
48
+ </small>
49
+ </p>
50
+
51
+ <br>
52
+
53
+ <img src="./docs/demo_preview.gif" width="800" style="max-width: 100%; box-shadow: 0 4px 8px rgba(0,0,0,0.1); border-radius: 8px;" alt="RepoReaper Demo">
54
+
55
+ <br>
56
+ </div>
57
+
58
+ ---
59
+
60
+ 自治型代码审计 Agent:解析任意 GitHub 仓库架构,构建语义缓存,支持即时上下文检索问答。
61
+
62
+ ---
63
+
64
+ ## ✨ 核心特性
65
+
66
+ | 特性 | 说明 |
67
+ |:----|:----|
68
+ | **多语言 AST 解析** | Python AST + 正则适配 Java / TS / Go / Rust 等 |
69
+ | **混合检索** | Qdrant 向量 + BM25 关键词,RRF 融合排序 |
70
+ | **JIT 动态加载** | 问答时自动拉取缺失文件 |
71
+ | **查询重写** | 自然语言 → 代码检索关键词 |
72
+ | **端到端追踪** | Langfuse 集成,全链路可观测 |
73
+ | **自动评估** | LLM-as-Judge 质量评分 |
74
+
75
+ ---
76
+
77
+ ## 🏗 系统架构
78
+
79
+ ```
80
+ ┌─────────────────────────────────────────────────────────────┐
81
+ │ Vue 3 前端 (SSE 流式 + Mermaid 架构图) │
82
+ └─────────────────────┬───────────────────────────────────────┘
83
+
84
+ ┌─────────────────────▼───────────────────────────────────────┐
85
+ │ FastAPI 后端 │
86
+ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────┐ │
87
+ │ │ Agent │ │ Chat │ │ Evaluation │ │
88
+ │ │ Service │ │ Service │ │ Framework │ │
89
+ │ └──────┬──────┘ └──────┬──────┘ └─────────────────────┘ │
90
+ │ │ │ │
91
+ │ ┌──────▼───────────────▼──────┐ ┌─────────────────────┐ │
92
+ │ │ Vector Service (Qdrant+BM25)│ │ Tracing (Langfuse) │ │
93
+ │ └─────────────────────────────┘ └─────────────────────┘ │
94
+ └─────────────────────────────────────────────────────────────┘
95
+ ```
96
+
97
+ ---
98
+
99
+ ## 🛠 技术栈
100
+
101
+ **后端:** Python 3.10+ · FastAPI · AsyncIO · Qdrant · BM25
102
+ **前端:** Vue 3 · Pinia · Mermaid.js · SSE
103
+ **模型:** DeepSeek V3 · SiliconFlow BGE-M3
104
+ **运维:** Docker · Gunicorn · Langfuse
105
+
106
+ ---
107
+
108
+ ## 🏁 快速开始
109
+
110
+ **前置要求:** Python 3.10+ ·(可选)Node 18+ 用于重新构建前端 · GitHub Token(推荐)· LLM API Key(必需)
111
+
112
+ ```bash
113
+ # 克隆 & 安装
114
+ git clone https://github.com/tzzp1224/RepoReaper.git && cd RepoReaper
115
+ python -m venv venv && source venv/bin/activate
116
+ pip install -r requirements.txt
117
+
118
+ # 配置 .env(建议从示例复制)
119
+ cp .env.example .env
120
+ # 必需:设置 LLM_PROVIDER 以及对应的 *_API_KEY
121
+ # 推荐:GITHUB_TOKEN 和 SILICON_API_KEY(Embedding)
122
+
123
+ # (可选)构建前端(仓库已包含 frontend-dist)
124
+ cd frontend-vue
125
+ npm install
126
+ npm run build
127
+ cd ..
128
+
129
+ # 启动
130
+ python -m app.main
131
+ ```
132
+
133
+ 访问 `http://localhost:8000`,输入任意 GitHub 仓库地址开始审计。
134
+
135
+ **Docker(单容器,本地 Qdrant):**
136
+ ```bash
137
+ cp .env.example .env
138
+ docker build -t reporeaper .
139
+ docker run -d -p 8000:8000 --env-file .env reporeaper
140
+ ```
141
+
142
+ **Docker Compose(推荐,包含 Qdrant Server):**
143
+ ```bash
144
+ cp .env.example .env
145
+ # 在 .env 中设置 QDRANT_MODE=server 与 QDRANT_URL=http://qdrant:6333
146
+ docker compose up -d --build
147
+ ```
148
+
149
+ ---
150
+
151
+ ## 📊 评估与追踪现状
152
+
153
+ | 组件 | 状态 | 说明 |
154
+ |:----|:----:|:----|
155
+ | **自研评估引擎** | ✅ 可用 | 四层指标(QueryRewrite / Retrieval / Generation / Agentic),LLM-as-Judge 判分 |
156
+ | **在线自动评估** | ✅ 可用 | 每次 `/chat` 结束后异步触发,结果写入 `evaluation/sft_data/` |
157
+ | **数据路由 (SFT)** | ✅ 可用 | 按评分自动分流 Gold/Silver/Bronze/Rejected → JSONL 文件 |
158
+ | **评估 API** | ✅ 可用 | `/evaluate`、`/evaluation/stats`、`/dashboard/*`、`/auto-eval/*` 共 7 个端点 |
159
+ | **离线检索评估** | ✅ 可用 | `test_retrieval.py` — Hit Rate、Recall@K、Precision@K、MRR |
160
+ | **Langfuse 追踪** | ⚠️ 部分完成 | 框架 + 14 处埋点已就位(agent/chat service);不可用时自动降级为本地日志 `logs/traces/` |
161
+ | **Ragas 集成** | ❌ 占位 | 默认 `use_ragas=False`;`_ragas_eval()` 调用方式与最新 Ragas SDK 不兼容 |
162
+ | **Langfuse ↔ 评估** | ❌ 未打通 | 评估结果仅写 JSONL,未上报 Langfuse Scores API |
163
+
164
+ > **综合完成度约 65%**:自研评估链路已闭环可用;Ragas 与 Langfuse 集成均为半成品。
165
+
166
+ ---
167
+
168
+ ## ⚠️ 已知问题
169
+
170
+ 1. **Python 3.14 + Langfuse 导入报错**
171
+ `pydantic.V1.errors.ConfigError: unable to infer type for attribute "description"` — Langfuse 3.x 内部依赖 `pydantic.v1` 兼容层,在 Python 3.14 下不兼容。
172
+ **临时方案:** 在 `.env` 中设置 `LANGFUSE_ENABLED=false`,或使用 Python 3.10–3.12。
173
+
174
+ 2. **`docker-compose.yml` 未包含 Langfuse 服务**
175
+ 即使导入成功,仍需运行中的 Langfuse 实例。请自行添加或使用 [app.langfuse.com](https://app.langfuse.com)。
176
+
177
+ 3. **Trace 链路未关联**
178
+ `tracing_service` 记录了 span/event,但调用 Langfuse API 时未传 `trace_id`,Langfuse UI 中只能看到孤立事件而非完整链路树。
179
+
180
+ 4. **Ragas `_ragas_eval()` API 过时**
181
+ 当前向 `ragas.evaluate()` 传递 dict,最新 Ragas 要求 `Dataset` 对象。已导出 `ragas_eval_dataset.json` 但无脚本消费它。
182
+
183
+ 5. **黄金数据集缺少标准答案**
184
+ 26 条测试用例的 `expected_answer` 均为空,无法做生成质量的 ground truth 对比。
185
+
186
+ 6. **启发式降级较粗糙**
187
+ 无 LLM client 时,`faithfulness` 用关键词重叠 + 0.2 基础分;`completeness` 纯粹按字数判断。
188
+
189
+ ---
190
+
191
+ ## 🗺 路线图
192
+
193
+ - [ ] **修复 Langfuse 兼容性** — 固定 `langfuse`/`pydantic` 版本或按 Python 版本门控导入
194
+ - [ ] **`docker-compose.yml` 加入 Langfuse** — 一键启动本地可观测平台
195
+ - [ ] **串联 trace_id** — 让 Langfuse UI 展示完整链路树
196
+ - [ ] **正式接入 Ragas** — 更新 `_ragas_eval()` 使用 `ragas.evaluate(Dataset(...))`,新增独立评估脚本
197
+ - [ ] **丰富黄金数据集** — 补充 `expected_answer`,扩展至 50+ 条用例
198
+ - [ ] **评估仪表盘前端** — Vue 组件可视化质量分布与 Bad Case
199
+ - [ ] **CI 回归基线** — 在 GitHub Actions 中运行 `test_retrieval.py`,指标回退时失败
200
+ - [ ] **对接 Langfuse Datasets** — 将评估结果推送到 Langfuse Scores/Datasets API,统一可观测
201
+
202
+ ---
203
+
204
+ ## 📈 Star History
205
+
206
+ <a href="https://star-history.com/#tzzp1224/RepoReaper&Date">
207
+ <picture>
208
+ <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=tzzp1224/RepoReaper&type=Date&theme=dark" />
209
+ <source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=tzzp1224/RepoReaper&type=Date" />
210
+ <img alt="Star History Chart" src="https://api.star-history.com/svg?repos=tzzp1224/RepoReaper&type=Date" />
211
+ </picture>
212
+ </a>
app/core/config.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 文件路径: app/core/config.py
2
+ """
3
+ 应用配置模块 - 统一配置中心
4
+
5
+ 支持多 LLM 供应商配置:
6
+ - OpenAI (GPT-4, GPT-4o 等)
7
+ - DeepSeek (deepseek-chat 等)
8
+ - Anthropic (Claude 系列)
9
+ - Google Gemini (gemini-3-flash-preview 等)
10
+ """
11
+ import os
12
+ from dataclasses import dataclass, field
13
+ from typing import Optional, Tuple
14
+ from dotenv import load_dotenv
15
+
16
+ # 加载 .env 文件
17
+ load_dotenv()
18
+
19
+
20
+ # ============================================================
21
+ # Agent 分析配置
22
+ # ============================================================
23
+
24
+ @dataclass
25
+ class AgentAnalysisConfig:
26
+ """Agent 分析引擎配置"""
27
+ # Repo Map 配置
28
+ initial_map_limit: int = 25 # 初始 Repo Map 文件数量 (提高精度)
29
+ max_symbols_per_file: int = 40 # 每文件最大符号数 (提高精度)
30
+
31
+ # 分析轮次配置
32
+ max_rounds: int = 4 # 最大分析轮数 (提高精度,因为报告可复用)
33
+ files_per_round: int = 5 # 每轮选择文件数 (提高精度)
34
+ max_context_length: int = 20000 # 上下文最大长度 (提高精度)
35
+
36
+ # 优先级配置
37
+ priority_exts: Tuple[str, ...] = (
38
+ '.py', '.java', '.go', '.js', '.ts', '.tsx', '.cpp', '.cs', '.rs'
39
+ )
40
+ priority_keywords: Tuple[str, ...] = (
41
+ 'main', 'app', 'core', 'api', 'service', 'utils', 'controller', 'model', 'config'
42
+ )
43
+
44
+
45
+ # ============================================================
46
+ # 向量服务配置
47
+ # ============================================================
48
+
49
@dataclass
class VectorServiceConfig:
    """Configuration for the vector search service."""
    # Data directories
    data_dir: str = "data"
    context_dir: str = "data/contexts"
    cache_version: str = "2.0"

    # Embedding settings
    embedding_api_url: str = "https://api.siliconflow.cn/v1"
    embedding_model: str = "BAAI/bge-m3"
    embedding_batch_size: int = 50
    embedding_max_length: int = 8000
    embedding_concurrency: int = 5
    embedding_dimensions: int = 1024

    # BM25 tokenisation: split on any run of characters that is not a word
    # character, '.', '@' or a CJK ideograph.
    tokenize_regex: str = r'[^a-zA-Z0-9_\.@\u4e00-\u9fa5]+'

    # Hybrid-search RRF (reciprocal rank fusion) parameters
    rrf_k: int = 60
    rrf_weight_vector: float = 1.0
    rrf_weight_bm25: float = 0.3
    search_oversample: int = 2
    default_top_k: int = 3

    # Session LRU cache
    session_max_count: int = 100  # max sessions kept in memory
77
+
78
+
79
+ # ============================================================
80
+ # 对话记忆配置
81
+ # ============================================================
82
+
83
@dataclass
class ConversationConfig:
    """Conversation-memory configuration."""
    # Sliding window
    max_recent_turns: int = 10  # keep the most recent N turns
    max_context_tokens: int = 8000  # maximum context size in tokens
    summary_threshold: int = 15  # start compressing beyond N turns
    # Conversation memory is stored purely in process memory; a service restart
    # clears it automatically, so no scheduled cleanup is needed.
91
+
92
+
93
+ # ============================================================
94
+ # Qdrant 配置
95
+ # ============================================================
96
+
97
@dataclass
class QdrantServiceConfig:
    """
    Qdrant vector database configuration.

    Three modes, selected via the QDRANT_MODE environment variable:
      - local:  embedded local storage (development, single worker)
      - server: Qdrant Server in Docker (production, multiple workers)
      - cloud:  Qdrant Cloud managed service

    Environment variables:
      - QDRANT_MODE: "local" | "server" | "cloud"
      - QDRANT_URL: server URL (server/cloud modes)
      - QDRANT_API_KEY: API key (required for cloud mode)
      - QDRANT_LOCAL_PATH: local storage path (local mode)
    """
    # Environment-backed fields use default_factory so the environment is read
    # when an instance is created, not frozen once at class-definition (import)
    # time as the previous plain `os.getenv(...)` defaults were.
    mode: str = field(default_factory=lambda: os.getenv("QDRANT_MODE", "local"))
    url: str = field(default_factory=lambda: os.getenv("QDRANT_URL", ""))
    host: str = field(default_factory=lambda: os.getenv("QDRANT_HOST", "localhost"))
    port: int = field(default_factory=lambda: int(os.getenv("QDRANT_PORT", "6333")))
    grpc_port: int = field(default_factory=lambda: int(os.getenv("QDRANT_GRPC_PORT", "6334")))
    prefer_grpc: bool = True
    api_key: str = field(default_factory=lambda: os.getenv("QDRANT_API_KEY", ""))

    local_path: str = field(default_factory=lambda: os.getenv("QDRANT_LOCAL_PATH", "data/qdrant_db"))

    vector_size: int = 1024  # BGE-M3 embedding dimension
    hnsw_m: int = 16
    hnsw_ef_construct: int = 100
    batch_size: int = 100
    timeout: float = 30.0
128
+
129
+
130
+ # ============================================================
131
+ # LLM 供应商配置
132
+ # ============================================================
133
+
134
+
135
class Settings:
    """Application settings, read once from the environment at import time."""

    # --- LLM provider selection ---
    # Supported: "openai", "deepseek", "anthropic", "gemini"
    LLM_PROVIDER = os.getenv("LLM_PROVIDER", "deepseek")

    # --- API keys (set the one matching the selected provider) ---
    GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")

    # OpenAI
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
    OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL")  # optional custom endpoint

    # DeepSeek
    DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
    DEEPSEEK_BASE_URL = os.getenv("DEEPSEEK_BASE_URL", "https://api.deepseek.com")

    # Anthropic (Claude)
    ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")

    # Google Gemini
    GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
    GEMINI_BASE_URL = os.getenv("GEMINI_BASE_URL")  # optional OpenAI-compatible endpoint

    # SiliconFlow (embeddings)
    SILICON_API_KEY = os.getenv("SILICON_API_KEY")

    # --- Model selection ---
    # When unset, the selected provider's default model is used
    # (see default_model_name).
    MODEL_NAME = os.getenv("MODEL_NAME")

    # --- Server binding ---
    HOST = os.getenv("HOST", "127.0.0.1")
    PORT = int(os.getenv("PORT", 8000))

    # --- Default LLM generation parameters ---
    LLM_TEMPERATURE = float(os.getenv("LLM_TEMPERATURE", "0.1"))
    LLM_MAX_TOKENS = int(os.getenv("LLM_MAX_TOKENS", "4096"))
    LLM_TIMEOUT = int(os.getenv("LLM_TIMEOUT", "600"))

    @property
    def current_api_key(self) -> Optional[str]:
        """Return the API key for the currently selected provider (or None)."""
        key_mapping = {
            "openai": self.OPENAI_API_KEY,
            "deepseek": self.DEEPSEEK_API_KEY,
            "anthropic": self.ANTHROPIC_API_KEY,
            "gemini": self.GEMINI_API_KEY,
        }
        return key_mapping.get(self.LLM_PROVIDER.lower())

    @property
    def current_base_url(self) -> Optional[str]:
        """Return the base URL for the currently selected provider (or None)."""
        url_mapping = {
            "openai": self.OPENAI_BASE_URL,
            "deepseek": self.DEEPSEEK_BASE_URL,
            "anthropic": None,
            "gemini": self.GEMINI_BASE_URL,
        }
        return url_mapping.get(self.LLM_PROVIDER.lower())

    @property
    def default_model_name(self) -> str:
        """Return MODEL_NAME when set, otherwise the provider's default model."""
        defaults = {
            "openai": "gpt-4o-mini",
            "deepseek": "deepseek-chat",
            "anthropic": "claude-3-5-sonnet-20241022",
            "gemini": "gemini-3-flash-preview",
        }
        return self.MODEL_NAME or defaults.get(self.LLM_PROVIDER.lower(), "default")

    def validate(self):
        """Check at startup that the required configuration is present.

        Raises:
            ValueError: when the selected provider's API key is missing.
        """
        provider = self.LLM_PROVIDER.lower()
        print(f"🔧 LLM Provider: {provider.upper()}")

        # 1. The selected provider must have its API key configured.
        if not self.current_api_key:
            key_name = f"{provider.upper()}_API_KEY"
            raise ValueError(
                f"❌ 错误: 缺少 {key_name}。\n"
                f"   当前选择的 LLM 供应商是: {provider}\n"
                f"   请在 .env 文件中设置 {key_name},或更改 LLM_PROVIDER 为其他供应商。"
            )

        # 2. SiliconCloud key powers the embedding feature (warn only).
        if not self.SILICON_API_KEY:
            print("⚠️ 警告: 未找到 SILICON_API_KEY,向量检索功能可能无法工作。")

        # 3. GitHub token is optional but strongly recommended (warn only).
        if not self.GITHUB_TOKEN:
            print("⚠️ 警告: 未找到 GITHUB_TOKEN,GitHub API 请求将受到每小时 60 次的严格限制。")

        print(f"✅ 配置验证通过 (Model: {self.default_model_name})")
232
+
233
+
234
+ # ============================================================
235
+ # 全局配置实例
236
+ # ============================================================
237
+
238
+ # LLM 设置
239
+ settings = Settings()
240
+ settings.validate()
241
+
242
+ # 子系统配置
243
+ agent_config = AgentAnalysisConfig()
244
+ vector_config = VectorServiceConfig()
245
+ conversation_config = ConversationConfig()
246
+ qdrant_config = QdrantServiceConfig()
app/main.py ADDED
@@ -0,0 +1,560 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 文件路径: app/main.py
2
+ import sys
3
+ import io
4
+ import os
5
+ import asyncio
6
+ from contextlib import asynccontextmanager
7
+
8
# Force stdout to UTF-8 so non-ASCII output is not garbled on Windows consoles.
# reconfigure() (Python 3.7+) changes the encoding in place, which is safer
# than re-wrapping sys.stdout.buffer: the original wrapper crashed when stdout
# had already been replaced by an object without a raw `.buffer` (e.g. under
# test runners or output capture).
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")
10
+
11
+ from fastapi import FastAPI, Request
12
+ from fastapi.middleware.cors import CORSMiddleware
13
+ from sse_starlette.sse import EventSourceResponse
14
+ from fastapi.responses import StreamingResponse, HTMLResponse, JSONResponse
15
+ from fastapi.staticfiles import StaticFiles
16
+ import uvicorn
17
+
18
+ from app.core.config import settings
19
+ from app.services.agent_service import agent_stream
20
+ from app.services.chat_service import process_chat_stream, get_eval_data, clear_eval_data
21
+ from app.services.vector_service import store_manager
22
+ from app.services.auto_evaluation_service import (
23
+ init_auto_evaluation_service,
24
+ get_auto_evaluation_service,
25
+ EvaluationConfig
26
+ )
27
+ from evaluation.evaluation_framework import EvaluationEngine, EvaluationResult, DataRoutingEngine
28
+ from datetime import datetime
29
+ import uuid
30
+
31
+ settings.validate()
32
+
33
# === Application lifecycle ===
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan: startup banner, then client cleanup on shutdown."""
    from app.services.vector_service import store_manager

    # Runs at startup.
    print("🚀 Application starting...")
    # Repository data is persisted permanently; conversation memory is purely
    # in-memory and is cleared automatically on restart.

    yield

    # Runs at shutdown.
    print("🛑 Application shutting down...")

    # Close pooled GitHub client connections.
    from app.utils.github_client import close_github_client
    await close_github_client()

    # Close vector store connections.
    await store_manager.close_all()

    # Close the shared Qdrant client.
    from app.storage.qdrant_store import close_shared_client
    await close_shared_client()

    print("✅ Cleanup complete")
+
61
app = FastAPI(title="GitHub RAG Agent", lifespan=lifespan)

# === Evaluation engine initialisation ===
from app.utils.llm_client import client
eval_engine = EvaluationEngine(llm_client=client, model_name=settings.default_model_name)
data_router = DataRoutingEngine()

# === Auto-evaluation service initialisation (Phase 1) ===
auto_eval_config = EvaluationConfig(
    enabled=True,
    use_ragas=False,  # Phase 1: skip Ragas to avoid the extra dependency
    async_evaluation=True,  # asynchronous mode: does not block responses
    min_quality_score=0.4,  # minimum score threshold (0.4 = reject only the worst)
    min_query_length=10,  # minimum query length
    min_answer_length=100,  # minimum answer length
    require_repo_url=True,  # a repository URL is required
    require_code_in_context=True  # the retrieved context must contain code
)
auto_eval_service = init_auto_evaluation_service(
    eval_engine=eval_engine,
    data_router=data_router,
    config=auto_eval_config
)
print("✅ Auto Evaluation Service Initialized")

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# === Static files and frontend ===
app.mount("/static", StaticFiles(directory="app"), name="static")

# Static assets produced by the Vue 3 build (JS/CSS/assets).
# NOTE(review): os is already imported at the top of this module; this
# re-import is redundant but harmless.
import os
FRONTEND_DIST = os.path.join(os.path.dirname(os.path.dirname(__file__)), "frontend-dist")
if os.path.exists(FRONTEND_DIST):
    app.mount("/assets", StaticFiles(directory=os.path.join(FRONTEND_DIST, "assets")), name="vue-assets")
102
+
103
@app.get("/", response_class=HTMLResponse)
async def read_root():
    """Serve the SPA entry page, preferring the built Vue 3 bundle."""
    vue_index = os.path.join(FRONTEND_DIST, "index.html")
    # Fall back to the legacy frontend when no Vue build output is present.
    target = vue_index if os.path.exists(vue_index) else "frontend/index.html"
    with open(target, "r", encoding="utf-8") as f:
        return f.read()
113
+
114
@app.get("/health")
def health_check():
    """Liveness probe; always reports OK."""
    return {"status": "ok"}
117
+
118
@app.get("/api/sessions")
async def get_sessions():
    """Expose session-manager statistics."""
    stats = store_manager.get_stats()
    return JSONResponse(stats)
122
+
123
@app.post("/api/sessions/cleanup")
async def trigger_cleanup():
    """Manually trigger removal of expired files."""
    stats = await store_manager.cleanup_expired_files()
    payload = {"message": "Cleanup completed", "stats": stats}
    return JSONResponse(payload)
128
+
129
@app.delete("/api/sessions/{session_id}")
async def close_session(session_id: str):
    """Close the given session and release its resources."""
    await store_manager.close_session(session_id)
    message = f"Session {session_id} closed"
    return JSONResponse({"message": message})
134
+
135
+
136
+ # === 仓库级 Session API ===
137
+
138
@app.post("/api/repo/check")
async def check_repo_session(request: Request):
    """
    Check whether a repository already has an index and report for a language.

    Request:  { "url": "https://github.com/owner/repo", "language": "zh" }
    Response: {
        "exists": true/false,
        "session_id": "repo_xxx",
        "report": "..." (when a report exists for the requested language),
        "has_index": true/false,
        "available_languages": ["en", "zh"]
    }
    """
    from app.utils.session import generate_repo_session_id

    data = await request.json()
    repo_url = data.get("url", "").strip()
    language = data.get("language", "en")

    if not repo_url:
        return JSONResponse({"error": "Missing URL"}, status_code=400)

    # Derive the repository-based session id.
    session_id = generate_repo_session_id(repo_url)

    # Look up the (possibly existing) vector store for this session.
    store = store_manager.get_store(session_id)

    # Try to load previously persisted analysis context.
    context = store.load_context()

    if context and context.get("repo_url"):
        # The repository was analyzed before; report its state.
        # Fetch the report in the requested language (may be absent).
        report = store.get_report(language)
        available_languages = store.get_available_languages()
        global_context = context.get("global_context", {})
        # An index is considered present when a file tree was stored.
        has_index = bool(global_context.get("file_tree"))

        return JSONResponse({
            "exists": True,
            "session_id": session_id,
            "repo_url": context.get("repo_url"),
            "report": report,  # report in the requested language; may be None
            "has_index": has_index,
            "available_languages": available_languages,
            "requested_language": language,
        })
    else:
        return JSONResponse({
            "exists": False,
            "session_id": session_id,
            "has_index": False,
            "available_languages": [],
        })
194
+
195
+
196
@app.get("/analyze")
async def analyze(url: str, session_id: str, language: str = "en", regenerate_only: bool = False):
    """
    Repository analysis endpoint (SSE event stream).

    Args:
        url: Repository URL.
        session_id: Session ID.
        language: Report language ("en" or "zh").
        regenerate_only: When True, skip fetching/indexing and generate a
            report in the new language from the existing index.
    """
    if not session_id:
        return {"error": "Missing session_id"}
    return EventSourceResponse(agent_stream(url, session_id, language, regenerate_only))
210
+
211
# Strong references to fire-and-forget evaluation tasks. The event loop keeps
# only weak references to tasks, so without this set a pending background
# evaluation could be garbage-collected before it finishes
# (see the asyncio.create_task documentation).
_eval_background_tasks: set = set()


@app.post("/chat")
async def chat(request: Request):
    """
    Chat endpoint with automatic evaluation.

    Design:
    1. Return the chat stream to the client immediately (non-blocking).
    2. Run the automatic evaluation asynchronously in the background.
    3. Evaluation results are persisted under evaluation/sft_data/.
    """
    data = await request.json()
    user_query = data.get("query")
    session_id = data.get("session_id")
    repo_url = data.get("repo_url", "")

    if not user_query:
        return {"answer": "Please enter your question"}
    if not session_id:
        return {"answer": "Session lost"}

    async def chat_stream_with_eval():
        """Wrap process_chat_stream and trigger evaluation once the stream ends."""
        # Drop any stale evaluation data for this session.
        clear_eval_data(session_id)

        # Relay the chat stream to the client.
        async for chunk in process_chat_stream(user_query, session_id):
            yield chunk

        # The stream is complete; the evaluation data has been captured by
        # chat_service, so the auto-evaluation can now be scheduled.
        try:
            auto_eval_service = get_auto_evaluation_service()
            eval_data = get_eval_data(session_id)

            if auto_eval_service and eval_data and eval_data.answer:
                print(f"\n📊 [Auto-Eval] Starting evaluation for session {session_id}")
                print(f" - Query: {user_query[:50]}...")
                print(f" - Context length: {len(eval_data.retrieved_context)} chars")
                print(f" - Answer length: {len(eval_data.answer)} chars")

                # Schedule the evaluation without blocking stream shutdown.
                # Keep a strong reference so the task is not collected before
                # it runs; drop the reference once it completes.
                task = asyncio.create_task(
                    auto_eval_service.auto_evaluate_async(
                        query=user_query,
                        retrieved_context=eval_data.retrieved_context,
                        generated_answer=eval_data.answer,
                        session_id=session_id,
                        repo_url=repo_url,
                        language="zh" if any('\u4e00' <= c <= '\u9fff' for c in user_query) else "en"
                    )
                )
                _eval_background_tasks.add(task)
                task.add_done_callback(_eval_background_tasks.discard)
            else:
                if not auto_eval_service:
                    print("⚠️ Auto evaluation service not initialized")
                elif not eval_data:
                    print(f"⚠️ No eval data found for session {session_id}")
                elif not eval_data.answer:
                    print(f"⚠️ Empty answer for session {session_id}")
        except Exception as e:
            print(f"⚠️ Failed to trigger auto-eval: {e}")
            import traceback
            traceback.print_exc()

    return StreamingResponse(
        chat_stream_with_eval(),
        media_type="text/plain"
    )
287
+
288
+ # ===== Phase 2: 新增评估端点 =====
289
+
290
@app.post("/evaluate")
async def evaluate(request: Request):
    """
    Evaluation endpoint: score a generated answer along several dimensions.

    POST /evaluate
    {
        "query": "user question",
        "retrieved_context": "retrieved file contents",
        "generated_answer": "generated answer",
        "session_id": "session id",
        "repo_url": "repository URL (optional)"
    }
    """
    try:
        data = await request.json()

        # Extract required and optional fields.
        query = data.get("query")
        retrieved_context = data.get("retrieved_context", "")
        generated_answer = data.get("generated_answer")
        session_id = data.get("session_id", "unknown")
        repo_url = data.get("repo_url", "")

        if not query or not generated_answer:
            return {
                "error": "Missing required fields: query, generated_answer",
                "status": "failed"
            }

        # Ask the evaluation engine for generation-layer metrics.
        generation_metrics = await eval_engine.evaluate_generation(
            query=query,
            retrieved_context=retrieved_context,
            generated_answer=generated_answer
        )

        # Assemble the full evaluation result object.
        evaluation_result = EvaluationResult(
            session_id=session_id,
            query=query,
            repo_url=repo_url,
            timestamp=datetime.now(),
            language="en",
            generation_metrics=generation_metrics
        )

        # Aggregate the metrics into a single overall score.
        evaluation_result.compute_overall_score()

        # Data routing: classify the sample into a quality tier by score.
        quality_tier = data_router.route_sample(evaluation_result)

        return {
            "status": "success",
            "evaluation": {
                "faithfulness": generation_metrics.faithfulness,
                "answer_relevance": generation_metrics.answer_relevance,
                "answer_completeness": generation_metrics.answer_completeness,
                "overall_score": evaluation_result.overall_score
            },
            "quality_tier": quality_tier,
            "session_id": session_id
        }

    except Exception as e:
        import traceback
        traceback.print_exc()
        return {
            "error": str(e),
            "status": "failed"
        }
362
+
363
+
364
+ # ===== 自动评估相关端点 =====
365
+
366
@app.get("/auto-eval/review-queue")
async def get_review_queue():
    """
    List samples awaiting manual review.

    These are samples whose evaluation was anomalous (the custom score and
    the Ragas score diverged too much), so a human must decide which
    evaluator to trust.

    GET /auto-eval/review-queue
    """
    try:
        auto_eval_service = get_auto_evaluation_service()
        if not auto_eval_service:
            return {"error": "Auto evaluation service not initialized", "status": "failed"}

        queue = auto_eval_service.get_review_queue()

        return {
            "status": "success",
            "queue_size": len(queue),
            "samples": [
                {
                    "index": i,
                    "query": item["eval_result"].query,
                    "custom_score": item["custom_score"],
                    "ragas_score": item["ragas_score"],
                    "diff": item["diff"],
                    "quality_tier": item["eval_result"].data_quality_tier.value,
                    "timestamp": item["timestamp"]
                }
                for i, item in enumerate(queue)
            ]
        }
    except Exception as e:
        return {"error": str(e), "status": "failed"}
401
+
402
+
403
@app.post("/auto-eval/approve/{index}")
async def approve_sample(index: int):
    """
    Manually approve a queued sample (accept its evaluation result).

    POST /auto-eval/approve/0
    """
    try:
        service = get_auto_evaluation_service()
        if not service:
            return {"error": "Auto evaluation service not initialized", "status": "failed"}

        service.approve_sample(index)

        return {
            "status": "success",
            "message": f"Sample {index} approved and stored"
        }
    except Exception as e:
        return {"error": str(e), "status": "failed"}
423
+
424
+
425
@app.post("/auto-eval/reject/{index}")
async def reject_sample(index: int):
    """
    Manually reject a queued sample (discard its evaluation result).

    POST /auto-eval/reject/0
    """
    try:
        service = get_auto_evaluation_service()
        if not service:
            return {"error": "Auto evaluation service not initialized", "status": "failed"}

        service.reject_sample(index)

        return {
            "status": "success",
            "message": f"Sample {index} rejected and removed from queue"
        }
    except Exception as e:
        return {"error": str(e), "status": "failed"}
445
+
446
+
447
@app.get("/auto-eval/stats")
async def auto_eval_stats():
    """
    Report auto-evaluation configuration and review-queue size.

    GET /auto-eval/stats
    """
    try:
        auto_eval_service = get_auto_evaluation_service()
        if not auto_eval_service:
            return {"error": "Auto evaluation service not initialized", "status": "failed"}

        queue = auto_eval_service.get_review_queue()

        return {
            "status": "success",
            "auto_evaluation": {
                "enabled": auto_eval_service.config.enabled,
                "use_ragas": auto_eval_service.config.use_ragas,
                "async_mode": auto_eval_service.config.async_evaluation,
                "custom_weight": auto_eval_service.config.custom_weight,
                "ragas_weight": auto_eval_service.config.ragas_weight,
                "diff_threshold": auto_eval_service.config.diff_threshold
            },
            "review_queue_size": len(queue),
            "last_update": datetime.now().isoformat()
        }
    except Exception as e:
        return {"error": str(e), "status": "failed"}
476
+
477
+
478
@app.get("/evaluation/stats")
async def evaluation_stats():
    """
    Return aggregate evaluation statistics.

    GET /evaluation/stats
    """
    try:
        stats = eval_engine.get_statistics()
        summary = {
            "total_evaluations": stats.get("total_evaluations", 0),
            "average_score": stats.get("average_score", 0),
            "quality_distribution": stats.get("quality_distribution", {}),
            "top_issues": stats.get("top_issues", [])
        }
        return {"status": "success", "statistics": summary}
    except Exception as e:
        return {
            "error": str(e),
            "status": "failed"
        }
501
+
502
+
503
@app.get("/dashboard/quality-distribution")
async def quality_distribution():
    """
    Data-quality tier distribution (for the dashboard).

    GET /dashboard/quality-distribution
    """
    try:
        distribution = data_router.get_distribution()
        tiers = ("gold", "silver", "bronze", "rejected", "corrected")
        return {
            "status": "success",
            # Missing tiers default to 0 so the payload shape is stable.
            "distribution": {tier: distribution.get(tier, 0) for tier in tiers},
            "timestamp": datetime.now().isoformat()
        }
    except Exception as e:
        return {
            "error": str(e),
            "status": "failed"
        }
528
+
529
+
530
@app.get("/dashboard/bad-cases")
async def bad_cases():
    """
    Low-quality samples for manual review.

    GET /dashboard/bad-cases
    """
    try:
        samples = data_router.get_bad_samples(limit=10)
        cases = [
            {
                "query": s.get("query", ""),
                "issue": s.get("issue", ""),
                "score": s.get("score", 0)
            }
            for s in samples
        ]
        return {
            "status": "success",
            "bad_cases": cases,
            "total_bad_cases": len(cases)
        }
    except Exception as e:
        return {
            "error": str(e),
            "status": "failed"
        }
556
+
557
+
558
if __name__ == "__main__":
    # Run the ASGI app directly; keep reload disabled for production use.
    uvicorn.run("app.main:app", host=settings.HOST, port=settings.PORT, reload=False)
app/services/agent_service.py ADDED
@@ -0,0 +1,779 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 文件路径: app/services/agent_service.py
2
+ import json
3
+ import asyncio
4
+ import traceback
5
+ import re
6
+ import ast
7
+ import httpx
8
+ import time
9
+ from typing import Set, Tuple, List
10
+ from datetime import datetime
11
+ from app.core.config import settings, agent_config
12
+ from app.utils.llm_client import client
13
+ from app.utils.repo_lock import RepoLock
14
+ from app.services.github_service import get_repo_structure, get_file_content
15
+ from app.services.vector_service import store_manager
16
+ from app.services.chunking_service import UniversalChunker, ChunkingConfig
17
+ from app.services.tracing_service import tracing_service
18
+ from evaluation.evaluation_framework import EvaluationEngine, EvaluationResult, DataRoutingEngine
19
+
20
# === Helper: robust JSON extraction from LLM output ===
def extract_json_from_text(text):
    """Robustly extract a JSON value from LLM output.

    Strategy:
      1. Strip Markdown code fences (```json ... ```) and parse the remainder.
      2. Fall back to the outermost ``[...]`` span that parses as JSON.
      3. Return an empty list when nothing parseable is found.

    Note: the previous bare ``except:`` clauses also swallowed SystemExit and
    KeyboardInterrupt; the handlers are now narrowed to parse failures only.
    """
    # Attempt 1: parse after removing surrounding code fences.
    try:
        stripped = re.sub(r"^```(json)?|```$", "", text.strip(), flags=re.MULTILINE).strip()
        return json.loads(stripped)
    except (json.JSONDecodeError, ValueError, TypeError):
        pass
    # Attempt 2: greedily grab the outermost bracketed span.
    match = re.search(r"\[.*\]", text, re.DOTALL)
    if match:
        try:
            return json.loads(match.group(0))
        except (json.JSONDecodeError, ValueError):
            pass
    # Attempt 3: nothing usable.
    return []
32
+
33
# === Multi-language symbol extraction ===
def _extract_symbols(content, file_path):
    """
    Dispatch symbol extraction by file type to build the repo map.

    Python files use an AST walk (most accurate); other C-like languages fall
    back to regex heuristics. Unknown types yield no symbols.
    """
    has_ext = '.' in file_path
    ext = file_path.rsplit('.', 1)[-1].lower() if has_ext else ""

    if ext == 'py':
        # 1. Python: AST-based extraction (most accurate).
        return _extract_symbols_python(content)

    if ext in ('java', 'ts', 'tsx', 'js', 'jsx', 'go', 'cpp', 'cs', 'rs'):
        # 2. Other languages: regex heuristics (Java, TS, JS, Go, C++...).
        return _extract_symbols_regex(content, ext)

    return []
49
+
50
+ def _extract_symbols_python(content):
51
+ try:
52
+ tree = ast.parse(content)
53
+ symbols = []
54
+ for node in tree.body:
55
+ if isinstance(node, ast.ClassDef):
56
+ symbols.append(f" [C] {node.name}")
57
+ for sub in node.body:
58
+ if isinstance(sub, (ast.FunctionDef, ast.AsyncFunctionDef)):
59
+ if not sub.name.startswith("_") or sub.name == "__init__":
60
+ symbols.append(f" - {sub.name}")
61
+ elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
62
+ symbols.append(f" [F] {node.name}")
63
+ return symbols
64
+ except:
65
+ return []
66
+
67
def _extract_symbols_regex(content, ext):
    """
    Generic regex-based symbol extraction for C-like languages.

    Scans line by line for class-like and function-like declarations using
    per-language regex heuristics; the result is best-effort by design.
    """
    symbols = []
    lines = content.split('\n')

    # Per-language regex patterns for classes and functions/methods.
    patterns = {
        'java': {
            'class': re.compile(r'(?:public|protected|private)?\s*(?:static|abstract)?\s*(?:class|interface|enum)\s+([a-zA-Z0-9_]+)'),
            'func': re.compile(r'(?:public|protected|private)\s+(?:static\s+)?[\w<>[\]]+\s+([a-zA-Z0-9_]+)\s*\(')
        },
        'ts': {
            'class': re.compile(r'class\s+([a-zA-Z0-9_]+)'),
            'func': re.compile(r'(?:function\s+([a-zA-Z0-9_]+)|const\s+([a-zA-Z0-9_]+)\s*=\s*(?:async\s*)?\(|([a-zA-Z0-9_]+)\s*\([^)]*\)\s*[:\{])')
        },
        'go': {
            'class': re.compile(r'type\s+([a-zA-Z0-9_]+)\s+(?:struct|interface)'),
            'func': re.compile(r'func\s+(?:(?:\(.*\)\s+)?([a-zA-Z0-9_]+)|([a-zA-Z0-9_]+)\()')
        }
    }

    # Map the concrete extension onto one of the three pattern families;
    # Java patterns double for C#, C++ and Rust, 'ts' covers JS/TS variants.
    lang_key = 'java' if ext in ['java', 'cs', 'cpp', 'rs'] else 'go' if ext == 'go' else 'ts'
    rules = patterns.get(lang_key, patterns['java'])

    count = 0
    for line in lines:
        line = line.strip()
        # Skip blanks, comment lines, and obvious print/log noise.
        if not line or line.startswith(("//", "/*", "*", "#", "print", "console.")): continue
        # Stop once the per-file symbol budget is exhausted.
        if count > agent_config.max_symbols_per_file: break

        # Class-like declarations.
        c_match = rules['class'].search(line)
        if c_match:
            name = next((g for g in c_match.groups() if g), "Unknown")
            symbols.append(f" [C] {name}")
            count += 1
            continue

        # Method/function declarations (heuristic: a body opens on this line).
        if line.endswith('{') or "=>" in line:
            f_match = rules['func'].search(line)
            if f_match:
                name = next((g for g in f_match.groups() if g), None)
                # Filter out control-flow keywords and very short names.
                if name and len(name) > 2 and name not in ['if', 'for', 'switch', 'while', 'catch', 'return']:
                    symbols.append(f" - {name}")
                    count += 1

    return symbols
+
120
async def generate_repo_map(repo_url, file_list, limit=agent_config.initial_map_limit) -> Tuple[str, Set[str]]:
    """
    Build an enhanced, multi-language repository map.

    Returns:
        str: the rendered map text
        set: file paths included in the map (for incremental de-duplication)
    """
    # Select high-priority files: a prioritized extension AND either a shallow
    # path (<= 2 slashes) or a priority keyword somewhere in the path.
    priority_files = [
        f for f in file_list
        if f.endswith(agent_config.priority_exts) and
        (f.count('/') <= 2 or any(k in f.lower() for k in agent_config.priority_keywords))
    ]

    # De-duplicate, sort for determinism, and cap at `limit`.
    targets = sorted(list(set(priority_files)))[:limit]
    remaining = [f for f in file_list if f not in targets]

    repo_map_lines = []
    mapped_files_set = set(targets)  # record which files made it into the map

    async def process_file(path):
        # Fetch the file, then extract symbols off the event loop.
        content = await get_file_content(repo_url, path)
        if not content: return f"{path} (Read Failed)"

        symbols = await asyncio.to_thread(_extract_symbols, content, path)

        if symbols:
            return f"{path}\n" + "\n".join(symbols)
        return path

    repo_map_lines.append(f"--- Key Files Structure (Top {len(targets)}) ---")

    # Fetch and parse all target files concurrently.
    tasks = [process_file(f) for f in targets]
    results = await asyncio.gather(*tasks)
    repo_map_lines.extend(results)

    if remaining:
        repo_map_lines.append("\n--- Other Files ---")
        # Cap the plain listing at 300 entries to bound map size.
        if len(remaining) > 300:
            repo_map_lines.extend(remaining[:300])
            repo_map_lines.append(f"... ({len(remaining)-300} more files)")
        else:
            repo_map_lines.extend(remaining)

    return "\n".join(repo_map_lines), mapped_files_set
166
+
167
+
168
async def agent_stream(repo_url: str, session_id: str, language: str = "en", regenerate_only: bool = False):
    """
    Main analysis pipeline (SSE event generator).

    Args:
        repo_url: GitHub repository URL
        session_id: session identifier
        language: report language (zh/en)
        regenerate_only: when True, skip indexing and generate a report in
            the new language from existing data
    """
    short_id = session_id[-6:] if session_id else "unknown"

    # === Tracing setup ===
    trace_id = tracing_service.start_trace(
        trace_name="agent_analysis",
        session_id=session_id,
        metadata={"repo_url": repo_url, "language": language, "regenerate_only": regenerate_only}
    )
    start_time = time.time()

    # === Tell the client when another user is already analyzing this repo ===
    if not regenerate_only:
        if await RepoLock.is_locked(session_id):
            yield json.dumps({
                "step": "waiting",
                "message": f"⏳ Another user is analyzing this repository. Please wait..."
            })

    # === Acquire the per-repository lock (write operations only) ===
    try:
        async with RepoLock.acquire(session_id):
            # Delegate the actual work while holding the lock.
            async for event in _agent_stream_inner(
                repo_url, session_id, language, regenerate_only,
                short_id, trace_id, start_time
            ):
                yield event
    except TimeoutError as e:
        # Lock acquisition timed out: report and end the stream.
        yield json.dumps({
            "step": "error",
            "message": f"❌ {str(e)}. The repository is being analyzed by another user."
        })
+
210
+
211
+ async def _agent_stream_inner(
212
+ repo_url: str, session_id: str, language: str, regenerate_only: bool,
213
+ short_id: str, trace_id: str, start_time: float
214
+ ):
215
+ """
216
+ 实际的分析流程 (在锁保护下执行)
217
+ """
218
+ try:
219
+ vector_db = store_manager.get_store(session_id)
220
+
221
+ # 调试日志:确认 session 隔离
222
+ print(f"🔍 [DEBUG] session_id: {session_id}, collection: {vector_db.collection_name}, context_file: {vector_db._context_file}")
223
+
224
+ # === regenerate_only 模式:跳过索引,直接生成报告 ===
225
+ if regenerate_only:
226
+ yield json.dumps({"step": "init", "message": f"🔄 [Session: {short_id}] Regenerating report in {language}..."})
227
+ await asyncio.sleep(0.3)
228
+
229
+ # 从已有索引加载上下文
230
+ context = vector_db.load_context()
231
+ if not context:
232
+ yield json.dumps({"step": "error", "message": "❌ No existing index found. Please analyze the repository first."})
233
+ return
234
+
235
+ # 正确读取 global_context 内的字段
236
+ global_ctx = context.get("global_context", {})
237
+ file_tree_str = global_ctx.get("file_tree", "")
238
+ context_summary = global_ctx.get("summary", "")
239
+ visited_files = set() # regenerate 模式不需要这个,但报告生成需要引用
240
+
241
+ # 验证上下文与请求的仓库匹配
242
+ stored_repo_url = context.get("repo_url", "")
243
+ if stored_repo_url and repo_url not in stored_repo_url and stored_repo_url not in repo_url:
244
+ print(f"⚠️ [WARNING] repo_url mismatch! Request: {repo_url}, Stored: {stored_repo_url}")
245
+
246
+ yield json.dumps({"step": "generating", "message": f"📝 Generating report in {'Chinese' if language == 'zh' else 'English'}..."})
247
+ else:
248
+ # === 正常分析模式 ===
249
+ yield json.dumps({"step": "init", "message": f"🚀 [Session: {short_id}] Connecting to GitHub..."})
250
+ await asyncio.sleep(0.5)
251
+
252
+ await vector_db.reset() # 使用异步方法
253
+
254
+ chunker = UniversalChunker(config=ChunkingConfig(min_chunk_size=50))
255
+
256
+ file_list = await get_repo_structure(repo_url)
257
+ if not file_list:
258
+ raise Exception("Repository is empty or unreadable.")
259
+
260
+ yield json.dumps({"step": "fetched", "message": f"📦 Found {len(file_list)} files. Building Repo Map (AST Parsing)..."})
261
+
262
+ # === 接收 mapped_files 用于后续查重 + 计时 ===
263
+ map_start = time.time()
264
+ file_tree_str, mapped_files = await generate_repo_map(repo_url, file_list, limit=agent_config.initial_map_limit)
265
+ map_latency_ms = (time.time() - map_start) * 1000
266
+ tracing_service.add_event("repo_map_generated", {"latency_ms": map_latency_ms, "files_mapped": len(mapped_files)})
267
+
268
+ visited_files = set()
269
+ context_summary = ""
270
+ readme_file = next((f for f in file_list if f.lower().endswith("readme.md")), None)
271
+
272
+ for round_idx in range(agent_config.max_rounds):
273
+ yield json.dumps({"step": "thinking", "message": f"🕵️ [Round {round_idx+1}/{agent_config.max_rounds}] DeepSeek is analyzing Repo Map..."})
274
+
275
+ system_prompt = "You are a Senior Software Architect. Your goal is to understand the codebase."
276
+ user_content = f"""
277
+ [Project Repo Map]
278
+ (Contains file paths and key Class/Function signatures)
279
+ {file_tree_str}
280
+
281
+ [Files Already Read]
282
+ {list(visited_files)}
283
+
284
+ [Current Knowledge]
285
+ {context_summary}
286
+
287
+ [Task]
288
+ Select 1-{agent_config.files_per_round} MOST CRITICAL files to read next to understand the core logic.
289
+ Focus on files that seem to contain main logic based on the Repo Map symbols.
290
+
291
+ [Constraint]
292
+ Return ONLY a raw JSON list of strings. No markdown.
293
+ Example: ["src/main.py", "app/auth.py"]
294
+ """
295
+
296
+ if not client:
297
+ yield json.dumps({"step": "error", "message": "❌ LLM Client Not Initialized."})
298
+ return
299
+
300
+ # === Token & Latency Tracing ===
301
+ llm_start_time = time.time()
302
+ plan_messages = [
303
+ {"role": "system", "content": system_prompt},
304
+ {"role": "user", "content": user_content}
305
+ ]
306
+
307
+ response = await client.chat.completions.create(
308
+ model=settings.default_model_name,
309
+ messages=plan_messages,
310
+ temperature=0.1,
311
+ timeout=settings.LLM_TIMEOUT
312
+ )
313
+
314
+ llm_latency_ms = (time.time() - llm_start_time) * 1000
315
+ raw_content = response.choices[0].message.content
316
+
317
+ # 记录 Token 使用量
318
+ usage = getattr(response, 'usage', None)
319
+ tracing_service.record_llm_generation(
320
+ model=settings.default_model_name,
321
+ prompt_messages=plan_messages,
322
+ generated_text=raw_content,
323
+ total_latency_ms=llm_latency_ms,
324
+ prompt_tokens=usage.prompt_tokens if usage else None,
325
+ completion_tokens=usage.completion_tokens if usage else None,
326
+ total_tokens=usage.total_tokens if usage else None,
327
+ is_streaming=False,
328
+ metadata={"step": "file_selection", "round": round_idx + 1}
329
+ )
330
+ target_files = extract_json_from_text(raw_content)
331
+
332
+ valid_files = [f for f in target_files if f in file_list and f not in visited_files]
333
+
334
+ if round_idx == 0 and readme_file and readme_file not in visited_files and readme_file not in valid_files:
335
+ valid_files.insert(0, readme_file)
336
+
337
+ if not valid_files:
338
+ yield json.dumps({"step": "plan", "message": f"🛑 [Round {round_idx+1}] Sufficient context gathered."})
339
+ break
340
+
341
+ yield json.dumps({"step": "plan", "message": f"👉 [Round {round_idx+1}] Selected: {valid_files}"})
342
+
343
+ # === 并发模型缺陷优化 (并行下载处理) ===
344
+ async def process_single_file(file_path):
345
+ try:
346
+ file_start = time.time()
347
+
348
+ # 🔧 异步 GitHub API (已优化为非阻塞)
349
+ content = await get_file_content(repo_url, file_path)
350
+ if not content:
351
+ tracing_service.add_event("file_read_failed", {"file": file_path})
352
+ return None
353
+
354
+ # 1. 摘要与 Context
355
+ lines = content.split('\n')[:50]
356
+ preview = "\n".join(lines)
357
+ file_knowledge = f"\n--- File: {file_path} ---\n{preview}\n"
358
+
359
+ # 2. Repo Map 增量更新与查重
360
+ new_map_entry = None
361
+ if file_path not in mapped_files:
362
+ symbols = await asyncio.to_thread(_extract_symbols, content, file_path)
363
+ if symbols:
364
+ new_map_entry = f"{file_path}\n" + "\n".join(symbols)
365
+
366
+ # 3. 切片与入库
367
+ chunks = await asyncio.to_thread(chunker.chunk_file, content, file_path)
368
+ if chunks:
369
+ documents = [c["content"] for c in chunks]
370
+ metadatas = []
371
+ for c in chunks:
372
+ meta = c["metadata"]
373
+ metadatas.append({
374
+ "file": meta["file"],
375
+ "type": meta["type"],
376
+ "name": meta.get("name", ""),
377
+ "class": meta.get("class") or ""
378
+ })
379
+ if documents:
380
+ try:
381
+ await vector_db.add_documents(documents, metadatas)
382
+ except Exception as e:
383
+ print(f"❌ 索引错误 {file_path}: {e}")
384
+ # 不中断,继续处理其他文件
385
+ return None
386
+
387
+ file_latency_ms = (time.time() - file_start) * 1000
388
+ tracing_service.add_event("file_processed", {
389
+ "file": file_path,
390
+ "latency_ms": file_latency_ms,
391
+ "chunks_count": len(chunks) if chunks else 0
392
+ })
393
+
394
+ return {
395
+ "path": file_path,
396
+ "knowledge": file_knowledge,
397
+ "map_entry": new_map_entry
398
+ }
399
+ except Exception as e:
400
+ print(f"❌ 处理文件错误 {file_path}: {e}")
401
+ return None
402
+
403
+ # 提示开始并发下载
404
+ yield json.dumps({"step": "download", "message": f"📥 Starting parallel download for {len(valid_files)} files..."})
405
+
406
+ # 启动并发任务 (return_exceptions=True 防止单个失败导致整个中断)
407
+ tasks = [process_single_file(f) for f in valid_files]
408
+ results = await asyncio.gather(*tasks, return_exceptions=True)
409
+
410
+ # 聚合结果
411
+ download_count = 0
412
+ for res in results:
413
+ if not res or isinstance(res, Exception):
414
+ if isinstance(res, Exception):
415
+ print(f"❌ Task 异常: {res}")
416
+ continue
417
+ download_count += 1
418
+ visited_files.add(res["path"])
419
+ context_summary += res["knowledge"]
420
+
421
+ # 增量更新 Map
422
+ if res["map_entry"]:
423
+ file_tree_str = f"{res['map_entry']}\n\n{file_tree_str}"
424
+ mapped_files.add(res["path"])
425
+
426
+ # === 硬编码截断解耦 ===
427
+ context_summary = context_summary[:agent_config.max_context_length]
428
+
429
+ global_context_data = {
430
+ "file_tree": file_tree_str,
431
+ "summary": context_summary[:8000]
432
+ }
433
+ await vector_db.save_context(repo_url, global_context_data)
434
+
435
+ yield json.dumps({"step": "indexing", "message": f"🧠 [Round {round_idx+1}] Processed {download_count} files. Knowledge graph updated."})
436
+
437
+ # Final Report (正常分析模式下的提示)
438
+ yield json.dumps({"step": "generating", "message": "📝 Generating technical report..."})
439
+
440
+ # === 报告生成 (两种模式共用) ===
441
+
442
+ # === P0: 向量检索补充关键代码片段 ===
443
+ yield json.dumps({"step": "enriching", "message": "🔍 Retrieving key code snippets..."})
444
+
445
+ key_queries = [
446
+ "main entry point initialization startup",
447
+ "core business logic handler processor",
448
+ "API routes endpoints controllers",
449
+ "database models schema ORM",
450
+ "authentication authorization middleware"
451
+ ]
452
+
453
+ retrieved_snippets = []
454
+ try:
455
+ await vector_db.initialize()
456
+ for query in key_queries:
457
+ results = await vector_db.search_hybrid(query, top_k=2)
458
+ for r in results:
459
+ snippet = r.get("content", "")[:400]
460
+ file_path = r.get("file", "unknown")
461
+ if snippet and snippet not in [s.split("]")[1] if "]" in s else s for s in retrieved_snippets]:
462
+ retrieved_snippets.append(f"[{file_path}]\n{snippet}")
463
+ except Exception as e:
464
+ print(f"⚠️ 向量检索失败: {e}")
465
+
466
+ code_snippets_section = "\n\n".join(retrieved_snippets[:8]) if retrieved_snippets else ""
467
+
468
+ # === P1: 依赖文件解析 ===
469
+ dep_files = ["requirements.txt", "pyproject.toml", "package.json", "go.mod", "Cargo.toml", "pom.xml", "build.gradle"]
470
+ dependencies_info = ""
471
+
472
+ # 获取 file_list(regenerate_only 模式下需要重新获取)
473
+ if regenerate_only:
474
+ try:
475
+ temp_file_list = await get_repo_structure(repo_url)
476
+ except:
477
+ temp_file_list = []
478
+ else:
479
+ temp_file_list = file_list if 'file_list' in dir() else []
480
+
481
+ for dep_file in dep_files:
482
+ matching = [f for f in temp_file_list if f.endswith(dep_file)]
483
+ for f in matching[:1]: # 只取第一个匹配
484
+ try:
485
+ content = await get_file_content(repo_url, f)
486
+ if content:
487
+ dependencies_info += f"\n[{f}]\n{content[:800]}\n"
488
+ except:
489
+ pass
490
+
491
+ # 构建增强的上下文
492
+ enhanced_context = f"""
493
+ {context_summary[:12000]}
494
+
495
+ [Key Code Snippets (Retrieved by Semantic Search)]
496
+ {code_snippets_section}
497
+
498
+ [Project Dependencies]
499
+ {dependencies_info if dependencies_info else "No dependency file found."}
500
+ """
501
+
502
+ repo_map_injection = f"""
503
+ [Project Repo Map (Structure)]
504
+ {file_tree_str}
505
+ """
506
+
507
+ # === 根据语言选择 Prompt ===
508
+ if language == "zh":
509
+ # --- 中文 Prompt ---
510
+ system_role = "你是一位务实的技术专家。目标是为开发者创建一个'3页纸'架构概览,让他们能在5分钟内看懂这个仓库。重点关注架构和数据流,不要纠结细节。"
511
+ analysis_user_content = f"""
512
+ [角色]
513
+ 你是一位务实的技术专家(Tech Lead)。
514
+
515
+ [输入数据]
516
+ {repo_map_injection}
517
+
518
+ 分析的文件: {list(visited_files)}
519
+
520
+ [代码知识库与关键片段]
521
+ {enhanced_context}
522
+
523
+ [严格限制]
524
+ 1. **不进行代码审查**: 不要列出 Bug、缺失功能或改进建议。
525
+ 2. **不评价**: 不要评价代码质量,只描述它**如何工作**。
526
+ 3. **语调**: 专业、结构化、描述性。使用中文回答。
527
+ 4. **不要废话**: 不要写"安全性"、"未来规划"等未请求的章节。
528
+
529
+ [输出格式要求 (Markdown)]
530
+
531
+ # 项目分析报告
532
+
533
+ ## 1. 执行摘要 (Executive Summary)
534
+ - **用途**: (这个项目具体解决什么问题?1-2句话)
535
+ - **核心功能**: (列出Top 3功能点)
536
+ - **技术栈**: (语言、框架、数据库、关键库)
537
+
538
+ ## 2. 系统架构 (Mermaid)
539
+ 创建一个 `graph TD` 图。
540
+ - 展示高层组件 (如 Client, API Server, Database, Worker, External Service)。
541
+ - 在连线上标注数据流 (如 "HTTP", "SQL")。
542
+ - **风格**: 保持概念清晰简单,节点数量控制在 8 个以内。
543
+
544
+ **⚠️ Mermaid 语法严格要求 (v10.x)**:
545
+ 1. **所有节点文本必须用双引号包裹**: `A["用户界面"]` ✓, `A[用户界面]` ✗
546
+ 2. **所有连线标签必须用双引号包裹**: `-->|"HTTP请求"|` ✓, `-->|HTTP请求|` ✗
547
+ 3. **禁止使用特殊字符**: 不要在文本中使用 `<br/>`, `/`, `(`, `)`, `&`, `<`, `>` 等
548
+ 4. **使用简短英文ID**: 节点ID用简短英文如 `A`, `B`, `Client`, `API`
549
+ 5. **subgraph 标题也需引号**: `subgraph "核心服务"` ✓
550
+ 6. **数据库节点**: 使用 `[("数据库")]` 格式
551
+
552
+ - **正确示例**:
553
+ ```mermaid
554
+ graph TD
555
+ Client["客户端"] -->|"HTTP请求"| API["API网关"]
556
+ API --> Service["业务服务"]
557
+ Service --> DB[("数据库")]
558
+ Service -->|"调用"| External["外部服务"]
559
+ ```
560
+
561
+ ## 3. 核心逻辑分析 (Table)
562
+ (总结关键模块,不要列出所有文件,只列最重要的)
563
+
564
+ | 组件/文件 | 职责 (它做什么?) | 关键设计模式/逻辑 |
565
+ | :--- | :--- | :--- |
566
+ | 例如 `auth_service.py` | 处理JWT颁发与验证 | 单例模式, 路由装饰器 |
567
+ | ... | ... | ... |
568
+
569
+ ## 4. 🔬 核心方法深度解析
570
+ (精选 3-5 个最关键的 `.py` 文件。针对每个文件,列出驱动逻辑的 Top 2-3 个方法)
571
+
572
+ ### 4.1 `[文件名]`
573
+ * **`[方法名]`**: [解释它做什么以及为什么重要,不要贴代码]
574
+ * **`[方法名]`**: [解释...]
575
+
576
+ ## 5. 主要工作流 (Mermaid)
577
+ 选择**一个最重要**的业务流程 (Happy Path)。
578
+ 创建一个 `sequenceDiagram`。
579
+ - 参与者应该是高层概念 (如 User, API, DB),不要用具体变量名。
580
+
581
+ **⚠️ sequenceDiagram 语法要求**:
582
+ 1. **participant 别名格式**: `participant API as "API服务"` ✓
583
+ 2. **消息文本用双引号**: `User->>API: "发起请求"` ✓
584
+ 3. **避免特殊字符**: 不要在消息中使用 `/`, `&`, `<`, `>` 等
585
+
586
+ - **正确示例**:
587
+ ```mermaid
588
+ sequenceDiagram
589
+ participant User as "用户"
590
+ participant API as "API服务"
591
+ participant DB as "数据库"
592
+ User->>API: "发起请求"
593
+ API->>DB: "查询数据"
594
+ DB-->>API: "返回结果"
595
+ API-->>User: "响应数据"
596
+ ```
597
+
598
+ ## 6. 快速开始 (Quick Start)
599
+ - **前置条件**: (如 Docker, Python 3.9+, .env 配置)
600
+ - **入口**: (如何启动主逻辑?如 `python main.py`)
601
+ """
602
+ else:
603
+ analysis_user_content = f"""
604
+ [Role]
605
+ You are a **Pragmatic Tech Lead**. Your goal is to create a **"3-Pages" Architecture Overview** for a developer who wants to understand this repo in 5 minutes.
606
+ [Input Data]
607
+ {repo_map_injection}
608
+
609
+ Files analyzed: {list(visited_files)}
610
+
611
+ [Code Knowledge & Key Snippets]
612
+ {enhanced_context}
613
+
614
+ [Strict Constraints]
615
+ 1. **NO Code Review**: Do NOT list bugs, issues, missing features, or recommendations.
616
+ 2. **NO Critique**: Do not judge the code quality. Focus on HOW it works.
617
+ 3. **Tone**: Professional, descriptive, and structural.
618
+ 4. **NO "FLUFF"**: Do NOT add unrequested sections like "Security", "Scalability", "Data Models", "Future Enhancements", etc.
619
+
620
+ [Required Output Format (Markdown)]
621
+
622
+ # Project Analysis Report
623
+
624
+ ## 1. Executive Summary
625
+ - **Purpose**: (What specific problem does this project solve? 1-2 sentences)
626
+ - **Key Features**: (Bullet points of top 3 features)
627
+ - **Tech Stack**: (List languages, frameworks, databases, and key libs)
628
+
629
+ ## 2. System Architecture
630
+ Create a `graph TD` diagram.
631
+ - Show high-level components (e.g., Client, API Server, Database, Worker, External Service).
632
+ - Label the edges with data flow (e.g., "HTTP", "SQL").
633
+ - **Style**: Keep it simple and conceptual. Limit to 8 nodes max.
634
+
635
+ **⚠️ Mermaid Syntax Rules (v10.x - MUST FOLLOW)**:
636
+ 1. **Wrap ALL node text in double quotes**: `A["User Client"]` ✓, `A[User Client]` ✗
637
+ 2. **Wrap ALL edge labels in double quotes**: `-->|"HTTP Request"|` ✓, `-->|HTTP Request|` ✗
638
+ 3. **NO special characters in text**: Avoid `/`, `()`, `&`, `<>`, `<br/>` in labels
639
+ 4. **Use short alphanumeric IDs**: e.g., `A`, `B`, `Client`, `API`, `DB`
640
+ 5. **Subgraph titles need quotes**: `subgraph "Core Services"` ✓
641
+ 6. **Database node format**: Use `[("Database")]` for cylinder shape
642
+
643
+ - **Correct Example**:
644
+ ```mermaid
645
+ graph TD
646
+ Client["User Client"] -->|"HTTP Request"| API["API Gateway"]
647
+ API --> Service["Business Service"]
648
+ Service --> DB[("Database")]
649
+ Service -->|"Calls"| External["External API"]
650
+ ```
651
+
652
+ ## 3. Core Logic Analysis
653
+ (Create a Markdown Table to summarize key modules. Do not list every file, only the most important ones.)
654
+
655
+ | Component/File | Responsibility (What does it do?) | Key Design Pattern / Logic |
656
+ | :--- | :--- | :--- |
657
+ | e.g. `auth_service.py` | Handles JWT issuance and verification | Singleton, Decorator for routes |
658
+ | ... | ... | ... |
659
+
660
+ ## 4. Core Methods Deep Dive
661
+ (Select the 3-5 most critical `.py` files. For each, list the top 2-3 methods that drive the logic.)
662
+
663
+ ### 4.1 `[Filename, e.g., agent_service.py]`
664
+ * **`[Method Name]`**: [Explanation of what it does and why it matters. No code.]
665
+ * **`[Method Name]`**: [Explanation...]
666
+
667
+ ### 4.2 `[Filename, e.g., vector_service.py]`
668
+ * **`[Method Name]`**: [Explanation...]
669
+ * ...
670
+
671
+ ## 5. Main Workflow (Mermaid)
672
+ Select the **Single Most Important** business flow (The "Happy Path").
673
+ Create a `sequenceDiagram`.
674
+ - Participants should be high-level (e.g., User, API, DB), not specific variable names.
675
+
676
+ **⚠️ sequenceDiagram Syntax Rules**:
677
+ 1. **Wrap participant aliases in quotes**: `participant API as "API Server"` ✓
678
+ 2. **Wrap message text in quotes**: `User->>API: "Send Request"` ✓
679
+ 3. **NO special characters**: Avoid `/`, `&`, `<`, `>` in messages
680
+
681
+ - **Correct Example**:
682
+ ```mermaid
683
+ sequenceDiagram
684
+ participant User as "User"
685
+ participant API as "API Server"
686
+ participant DB as "Database"
687
+ User->>API: "Send Request"
688
+ API->>DB: "Query Data"
689
+ DB-->>API: "Return Result"
690
+ API-->>User: "Send Response"
691
+ ```
692
+
693
+ ## 6. Quick Start Guide
694
+ - **Prerequisites**: (e.g. Docker, Python 3.9+, .env file)
695
+ - **Entry Point**: (How to run the main logic? e.g. `python main.py` or `uvicorn`)
696
+
697
+ """
698
+
699
+ # === 增加 timeout 防止长文本生成时断连 ===
700
+ report_messages = [
701
+ {"role": "system", "content": "You are a pragmatic Tech Lead. Focus on architecture and data flow, not implementation details."},
702
+ {"role": "user", "content": analysis_user_content}
703
+ ]
704
+
705
+ stream_start_time = time.time()
706
+ stream = await client.chat.completions.create(
707
+ model=settings.default_model_name,
708
+ messages=report_messages,
709
+ stream=True,
710
+ timeout=settings.LLM_TIMEOUT # 使用统一配置
711
+ )
712
+
713
+ # === TTFT & Token Tracking ===
714
+ first_token_received = False
715
+ ttft_ms = None
716
+ generated_text = ""
717
+ completion_tokens_estimate = 0
718
+
719
+ # === 增加 try-except 捕获流式传输中断 ===
720
+ try:
721
+ async for chunk in stream:
722
+ if chunk.choices[0].delta.content:
723
+ content = chunk.choices[0].delta.content
724
+
725
+ # 记录 TTFT (首 Token 时间)
726
+ if not first_token_received:
727
+ ttft_ms = (time.time() - stream_start_time) * 1000
728
+ tracing_service.record_ttft(
729
+ ttft_ms=ttft_ms,
730
+ model=settings.default_model_name,
731
+ metadata={"step": "report_generation"}
732
+ )
733
+ first_token_received = True
734
+
735
+ generated_text += content
736
+ completion_tokens_estimate += 1 # 粗略估计每个 chunk 约 1 token
737
+ yield json.dumps({"step": "report_chunk", "chunk": content})
738
+ except (httpx.ReadError, httpx.ConnectError) as e:
739
+ yield json.dumps({"step": "error", "message": f"⚠️ Network Timeout during generation: {str(e)}"})
740
+ return
741
+
742
+ # 流结束后记录完整的 LLM 生成信息
743
+ total_latency_ms = (time.time() - stream_start_time) * 1000
744
+ tracing_service.record_llm_generation(
745
+ model=settings.default_model_name,
746
+ prompt_messages=report_messages,
747
+ generated_text=generated_text,
748
+ ttft_ms=ttft_ms,
749
+ total_latency_ms=total_latency_ms,
750
+ completion_tokens=completion_tokens_estimate,
751
+ is_streaming=True,
752
+ metadata={"step": "report_generation", "generated_chars": len(generated_text)}
753
+ )
754
+
755
+ # === 保存报告 (按语言存储,异步避免阻塞) ===
756
+ await vector_db.save_report(generated_text, language)
757
+
758
+ yield json.dumps({"step": "finish", "message": "✅ Analysis Complete!"})
759
+
760
+ except Exception as e:
761
+ # === 全局异常捕获 ===
762
+ import traceback
763
+ traceback.print_exc()
764
+
765
+ # 提取友好的错误信息
766
+ error_msg = str(e)
767
+ if "401" in error_msg:
768
+ ui_msg = "❌ GitHub Token Invalid. Please check your settings."
769
+ elif "403" in error_msg:
770
+ ui_msg = "❌ GitHub API Rate Limit Exceeded. Try again later or add a Token."
771
+ elif "404" in error_msg:
772
+ ui_msg = "❌ Repository Not Found. Check the URL."
773
+ elif "Timeout" in error_msg or "ConnectError" in error_msg:
774
+ ui_msg = "❌ Network Timeout. LLM or GitHub is not responding."
775
+ else:
776
+ ui_msg = f"💥 System Error: {error_msg}"
777
+
778
+ yield json.dumps({"step": "error", "message": ui_msg})
779
+ return # 终止流
app/services/auto_evaluation_service.py ADDED
@@ -0,0 +1,481 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 文件路径: app/services/auto_evaluation_service.py
2
+ """
3
+ 自动评估服务 - Phase 1
4
+ 在后台异步进行评估,不阻塞用户请求
5
+
6
+ 工作流程:
7
+ 1. 用户调用 /chat 或 /analyze
8
+ 2. 获得立即响应
9
+ 3. 后台异步执行评估
10
+ 4. 评估结果存储到 evaluation/sft_data/
11
+ """
12
+
13
+ import asyncio
14
+ import json
15
+ import os
16
+ from datetime import datetime
17
+ from typing import Optional
18
+ from dataclasses import dataclass
19
+
20
+ from evaluation.evaluation_framework import (
21
+ EvaluationEngine,
22
+ EvaluationResult,
23
+ DataRoutingEngine,
24
+ DataQualityTier
25
+ )
26
+ from evaluation.utils import is_chatty_query, has_code_indicators
27
+ from app.services.tracing_service import tracing_service
28
+
29
+
30
+ @dataclass
31
+ class EvaluationConfig:
32
+ """
33
+ 自动评估配置
34
+
35
+ 数据路由阈值说明(与 data_router.py 一致):
36
+ - score > 0.9 → Gold → positive_samples.jsonl
37
+ - score > 0.6 → Silver → positive_samples.jsonl
38
+ - score > 0.4 → Bronze → negative_samples.jsonl
39
+ - score <= 0.4 → Rejected → 不存储
40
+ """
41
+ enabled: bool = True # 是否启用自动评估
42
+ use_ragas: bool = False # 是否使用 Ragas 进行 sanity check
43
+ custom_weight: float = 0.7 # custom_eval 的权重
44
+ ragas_weight: float = 0.3 # ragas_eval 的权重
45
+ diff_threshold: float = 0.2 # 差异阈值(超过则标记 needs_review)
46
+ min_quality_score: float = 0.4 # 最低质量分数(<=0.4 才拒绝)
47
+ async_evaluation: bool = True # 是否异步执行(推荐 True)
48
+ min_query_length: int = 10 # 最小 query 长度
49
+ min_answer_length: int = 100 # 最小 answer 长度
50
+ require_repo_url: bool = True # 是否要求有仓库 URL
51
+ require_code_in_context: bool = True # 是否要求上下文包含代码
52
+
53
+
54
+ class AutoEvaluationService:
55
+ """自动评估服务"""
56
+
57
+ def __init__(
58
+ self,
59
+ eval_engine: EvaluationEngine,
60
+ data_router: DataRoutingEngine,
61
+ config: EvaluationConfig = None
62
+ ):
63
+ self.eval_engine = eval_engine
64
+ self.data_router = data_router
65
+ self.config = config or EvaluationConfig()
66
+ self.needs_review_queue: list = [] # 需要人工审查的样本队列
67
+ self._evaluated_keys: set = set() # 防重复评估(session_id:query_hash)
68
+
69
+ # 被过滤数据的记录文件
70
+ self.skipped_samples_file = "evaluation/sft_data/skipped_samples.jsonl"
71
+ os.makedirs(os.path.dirname(self.skipped_samples_file), exist_ok=True)
72
+
73
+ def _record_skipped(self, reason: str, query: str, session_id: str,
74
+ repo_url: str = "", context_len: int = 0, answer_len: int = 0) -> None:
75
+ """记录被跳过的样本(供日后分析)"""
76
+ record = {
77
+ "timestamp": datetime.now().isoformat(),
78
+ "reason": reason,
79
+ "session_id": session_id,
80
+ "query": query[:200] if query else "",
81
+ "repo_url": repo_url,
82
+ "context_length": context_len,
83
+ "answer_length": answer_len
84
+ }
85
+ try:
86
+ with open(self.skipped_samples_file, 'a', encoding='utf-8') as f:
87
+ f.write(json.dumps(record, ensure_ascii=False) + '\n')
88
+ except Exception as e:
89
+ print(f" ⚠️ 记录跳过样本失败: {e}")
90
+
91
+ def _validate_input(
92
+ self,
93
+ query: str,
94
+ retrieved_context: str,
95
+ generated_answer: str,
96
+ session_id: str,
97
+ repo_url: str
98
+ ) -> tuple[bool, Optional[str]]:
99
+ """
100
+ 验证输入是否满足评估条件
101
+
102
+ Returns:
103
+ (is_valid, skip_reason) - 如果有效返回 (True, None),否则返回 (False, reason)
104
+ """
105
+ context_len = len(retrieved_context) if retrieved_context else 0
106
+ answer_len = len(generated_answer) if generated_answer else 0
107
+
108
+ # Query 验证
109
+ if not query or not query.strip():
110
+ self._record_skipped("query_empty", query or "", session_id, repo_url, context_len, answer_len)
111
+ return False, "query 为空"
112
+
113
+ if len(query.strip()) < self.config.min_query_length:
114
+ self._record_skipped("query_too_short", query, session_id, repo_url, context_len, answer_len)
115
+ return False, f"query 太短 ({len(query)} < {self.config.min_query_length})"
116
+
117
+ if is_chatty_query(query):
118
+ self._record_skipped("chatty_query", query, session_id, repo_url, context_len, answer_len)
119
+ return False, f"闲聊/无效 query: {query[:30]}"
120
+
121
+ # Repo URL 验证
122
+ if self.config.require_repo_url and not repo_url:
123
+ self._record_skipped("missing_repo_url", query, session_id, repo_url, context_len, answer_len)
124
+ return False, "缺少 repo_url"
125
+
126
+ # Answer 验证
127
+ if not generated_answer or len(generated_answer.strip()) < self.config.min_answer_length:
128
+ self._record_skipped("answer_too_short", query, session_id, repo_url, context_len, answer_len)
129
+ return False, f"回答太短 ({answer_len} < {self.config.min_answer_length})"
130
+
131
+ # Context 验证
132
+ if self.config.require_code_in_context and not has_code_indicators(retrieved_context):
133
+ self._record_skipped("no_code_in_context", query, session_id, repo_url, context_len, answer_len)
134
+ return False, "上下文中未检测到代码"
135
+
136
+ return True, None
137
+
138
+ def _check_duplicate(self, query: str, session_id: str) -> bool:
139
+ """检查是否重复评估,返回 True 表示是重复的"""
140
+ import hashlib
141
+ query_hash = hashlib.md5(query.encode()).hexdigest()[:8]
142
+ eval_key = f"{session_id}:{query_hash}"
143
+
144
+ if eval_key in self._evaluated_keys:
145
+ return True
146
+
147
+ self._evaluated_keys.add(eval_key)
148
+
149
+ # 限制缓存大小,防止内存泄漏
150
+ if len(self._evaluated_keys) > 1000:
151
+ self._evaluated_keys = set(list(self._evaluated_keys)[-500:])
152
+
153
+ return False
154
+
155
+ async def auto_evaluate(
156
+ self,
157
+ query: str,
158
+ retrieved_context: str,
159
+ generated_answer: str,
160
+ session_id: str = "auto",
161
+ repo_url: str = "",
162
+ language: str = "en"
163
+ ) -> Optional[str]:
164
+ """
165
+ 自动评估单个查询-回答对
166
+
167
+ Returns:
168
+ 质量等级 (gold/silver/bronze/rejected/needs_review) 或 None
169
+ """
170
+ # 输入验证
171
+ is_valid, skip_reason = self._validate_input(
172
+ query, retrieved_context, generated_answer, session_id, repo_url
173
+ )
174
+ if not is_valid:
175
+ print(f" ⚠️ [AutoEval] 跳过: {skip_reason}")
176
+ return None
177
+
178
+ # 防重复评估
179
+ if self._check_duplicate(query, session_id):
180
+ print(f" ⏭️ [AutoEval] 跳过重复评估: {query[:30]}...")
181
+ return None
182
+
183
+ start_time = datetime.now()
184
+
185
+ try:
186
+ # Step 1: 自定义评估
187
+ print(f"📊 [AutoEval] 开始评估: {query[:50]}...")
188
+
189
+ custom_metrics = await self.eval_engine.evaluate_generation(
190
+ query=query,
191
+ retrieved_context=retrieved_context,
192
+ generated_answer=generated_answer
193
+ )
194
+ custom_score = custom_metrics.overall_score()
195
+
196
+ print(f" ✓ Custom Score: {custom_score:.3f}")
197
+ print(f" - Faithfulness: {custom_metrics.faithfulness:.3f}")
198
+ print(f" - Answer Relevance: {custom_metrics.answer_relevance:.3f}")
199
+ print(f" - Completeness: {custom_metrics.answer_completeness:.3f}")
200
+
201
+ # Step 2: Ragas Sanity Check (如果启用)
202
+ ragas_score = None
203
+ ragas_details = None
204
+
205
+ if self.config.use_ragas:
206
+ try:
207
+ ragas_score, ragas_details = await self._ragas_eval(
208
+ query=query,
209
+ context=retrieved_context,
210
+ answer=generated_answer
211
+ )
212
+ print(f" ✓ Ragas Score: {ragas_score:.3f}")
213
+ if ragas_details:
214
+ print(f" - {ragas_details}")
215
+ except Exception as e:
216
+ print(f" ⚠️ Ragas 评估失败: {e}")
217
+ # Ragas 失败不应该中断主流程
218
+
219
+ # ============================================================
220
+ # Step 3: 混合评估 + 异常检测
221
+ # ============================================================
222
+ final_score, quality_status = self._compute_final_score(
223
+ custom_score=custom_score,
224
+ ragas_score=ragas_score
225
+ )
226
+
227
+ print(f" ✓ Final Score: {final_score:.3f} | Status: {quality_status}")
228
+
229
+ # ============================================================
230
+ # Step 4: 构建评估结果并存储
231
+ # ============================================================
232
+ eval_result = EvaluationResult(
233
+ session_id=session_id,
234
+ query=query,
235
+ repo_url=repo_url,
236
+ timestamp=start_time,
237
+ language=language,
238
+ generation_metrics=custom_metrics,
239
+ notes=f"ragas_score={ragas_score:.3f}" if ragas_score else ""
240
+ )
241
+
242
+ # 设置综合得分
243
+ eval_result.overall_score = final_score
244
+
245
+ # 根据状态和得分确定质量等级
246
+ print(f" [DEBUG] quality_status={quality_status}, final_score={final_score:.3f}, threshold={self.config.min_quality_score}")
247
+
248
+ if quality_status == "needs_review":
249
+ eval_result.data_quality_tier = DataQualityTier.BRONZE
250
+ eval_result.notes += " | needs_review=true"
251
+ # 加入审查队列
252
+ self.needs_review_queue.append({
253
+ "eval_result": eval_result,
254
+ "custom_score": custom_score,
255
+ "ragas_score": ragas_score,
256
+ "diff": abs(custom_score - (ragas_score or custom_score)),
257
+ "timestamp": start_time.isoformat()
258
+ })
259
+ print(f" ⚠️ 需要人工审查 (needs_review),暂存队列")
260
+ # 同时也路由到数据存储,便于后续分析
261
+ self.data_router.route_sample(eval_result)
262
+ elif final_score > self.config.min_quality_score:
263
+ # score > 0.4: 路由到 positive (>0.6) 或 negative (0.4-0.6)
264
+ print(f" ✓ 路由到 data_router (score {final_score:.2f} > {self.config.min_quality_score})")
265
+ self.data_router.route_sample(eval_result)
266
+ else:
267
+ # score <= 0.4: 质量太差,直接拒绝
268
+ eval_result.data_quality_tier = DataQualityTier.REJECTED
269
+ print(f" ❌ 评分过低 ({final_score:.2f} <= {self.config.min_quality_score}),拒绝存储")
270
+
271
+ # 记录到 tracing
272
+ tracing_service.add_event("auto_evaluation_completed", {
273
+ "query": query[:100],
274
+ "custom_score": custom_score,
275
+ "ragas_score": ragas_score,
276
+ "final_score": final_score,
277
+ "status": quality_status,
278
+ "quality_tier": eval_result.data_quality_tier.value
279
+ })
280
+
281
+ print(f" ✅ 评估完成\n")
282
+
283
+ return eval_result.data_quality_tier.value
284
+
285
+ except Exception as e:
286
+ print(f" ❌ 自动评估异常: {e}")
287
+ import traceback
288
+ traceback.print_exc()
289
+ return None
290
+
291
+ async def auto_evaluate_async(
292
+ self,
293
+ query: str,
294
+ retrieved_context: str,
295
+ generated_answer: str,
296
+ session_id: str = "auto",
297
+ repo_url: str = "",
298
+ language: str = "en"
299
+ ) -> None:
300
+ """
301
+ 异步版本 - 不阻塞主流程
302
+
303
+ 在后台执行评估,不等待结果
304
+ """
305
+ if not self.config.async_evaluation:
306
+ # 同步模式(不推荐在生产环境)
307
+ await self.auto_evaluate(
308
+ query=query,
309
+ retrieved_context=retrieved_context,
310
+ generated_answer=generated_answer,
311
+ session_id=session_id,
312
+ repo_url=repo_url,
313
+ language=language
314
+ )
315
+ else:
316
+ # 异步模式(推荐)- 在后台执行
317
+ asyncio.create_task(
318
+ self._eval_task(
319
+ query=query,
320
+ retrieved_context=retrieved_context,
321
+ generated_answer=generated_answer,
322
+ session_id=session_id,
323
+ repo_url=repo_url,
324
+ language=language
325
+ )
326
+ )
327
+
328
+ async def _eval_task(
329
+ self,
330
+ query: str,
331
+ retrieved_context: str,
332
+ generated_answer: str,
333
+ session_id: str,
334
+ repo_url: str,
335
+ language: str
336
+ ) -> None:
337
+ """后台评估任务包装"""
338
+ try:
339
+ await asyncio.sleep(0.1) # 让用户请求先返回
340
+ await self.auto_evaluate(
341
+ query=query,
342
+ retrieved_context=retrieved_context,
343
+ generated_answer=generated_answer,
344
+ session_id=session_id,
345
+ repo_url=repo_url,
346
+ language=language
347
+ )
348
+ except Exception as e:
349
+ print(f"❌ Background eval task failed: {e}")
350
+
351
+ def _compute_final_score(
352
+ self,
353
+ custom_score: float,
354
+ ragas_score: Optional[float]
355
+ ) -> tuple[float, str]:
356
+ """
357
+ 计算最终得分和状态
358
+
359
+ Returns:
360
+ (final_score, status)
361
+ status: "normal" / "needs_review" / "high_confidence"
362
+ """
363
+
364
+ if ragas_score is None:
365
+ # 没有 Ragas 分数,直接用 custom 分数
366
+ return custom_score, "normal"
367
+
368
+ # 计算差异
369
+ diff = abs(custom_score - ragas_score)
370
+
371
+ # 判断异常
372
+ if diff > self.config.diff_threshold:
373
+ # 差异过大,标记为需要审查
374
+ return custom_score, "needs_review"
375
+
376
+ # 混合评分
377
+ final_score = (
378
+ self.config.custom_weight * custom_score +
379
+ self.config.ragas_weight * ragas_score
380
+ )
381
+
382
+ # 两者都高分 → 高置信度
383
+ if custom_score > 0.75 and ragas_score > 0.75:
384
+ status = "high_confidence"
385
+ else:
386
+ status = "normal"
387
+
388
+ return final_score, status
389
+
390
+ async def _ragas_eval(
391
+ self,
392
+ query: str,
393
+ context: str,
394
+ answer: str
395
+ ) -> tuple[Optional[float], Optional[str]]:
396
+ """
397
+ 使用 Ragas 进行 sanity check
398
+
399
+ Returns:
400
+ (score, details)
401
+ """
402
+ try:
403
+ from ragas.metrics import faithfulness, answer_relevancy
404
+ from ragas import evaluate
405
+
406
+ # 构造 Ragas 数据集
407
+ dataset_dict = {
408
+ "question": [query],
409
+ "contexts": [[context]],
410
+ "answer": [answer]
411
+ }
412
+
413
+ # 执行评估
414
+ result = evaluate(
415
+ dataset=dataset_dict,
416
+ metrics=[faithfulness, answer_relevancy]
417
+ )
418
+
419
+ # 提取分数
420
+ faithfulness_score = result["faithfulness"][0] if "faithfulness" in result else 0.5
421
+ relevancy_score = result["answer_relevancy"][0] if "answer_relevancy" in result else 0.5
422
+
423
+ # 平均得分
424
+ ragas_score = (faithfulness_score + relevancy_score) / 2
425
+
426
+ details = f"Ragas: faithfulness={faithfulness_score:.3f}, relevancy={relevancy_score:.3f}"
427
+
428
+ return ragas_score, details
429
+
430
+ except ImportError:
431
+ print("⚠️ Ragas 未安装,跳过 sanity check")
432
+ return None, None
433
+ except Exception as e:
434
+ print(f"⚠️ Ragas 评估异常: {e}")
435
+ return None, None
436
+
437
+ def get_review_queue(self) -> list:
438
+ """获取需要审查的样本列表"""
439
+ return self.needs_review_queue
440
+
441
+ def clear_review_queue(self) -> None:
442
+ """清空审查队列"""
443
+ self.needs_review_queue.clear()
444
+
445
+ def approve_sample(self, index: int) -> None:
446
+ """人工批准某个样本"""
447
+ if 0 <= index < len(self.needs_review_queue):
448
+ item = self.needs_review_queue[index]
449
+ # 直接存储到评估结果
450
+ self.data_router.route_sample(item["eval_result"])
451
+ print(f"✅ 样本 {index} 已批准")
452
+
453
+ def reject_sample(self, index: int) -> None:
454
+ """人工拒绝某个样本"""
455
+ if 0 <= index < len(self.needs_review_queue):
456
+ print(f"❌ 样本 {index} 已拒绝")
457
+ self.needs_review_queue.pop(index)
458
+
459
+
460
+ # 全局实例
461
+ auto_eval_service: Optional[AutoEvaluationService] = None
462
+
463
+
464
+ def init_auto_evaluation_service(
465
+ eval_engine: EvaluationEngine,
466
+ data_router: DataRoutingEngine,
467
+ config: EvaluationConfig = None
468
+ ) -> AutoEvaluationService:
469
+ """初始化自动评估服务"""
470
+ global auto_eval_service
471
+ auto_eval_service = AutoEvaluationService(
472
+ eval_engine=eval_engine,
473
+ data_router=data_router,
474
+ config=config
475
+ )
476
+ return auto_eval_service
477
+
478
+
479
+ def get_auto_evaluation_service() -> Optional[AutoEvaluationService]:
480
+ """获取自动评估服务实例"""
481
+ return auto_eval_service
app/services/chat_service.py ADDED
@@ -0,0 +1,601 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 文件路径: app/services/chat_service.py
2
+ import json
3
+ import asyncio
4
+ import re
5
+ import time
6
+ from dataclasses import dataclass, field
7
+ from typing import Dict, Optional, AsyncGenerator, List, Set
8
+ from app.core.config import settings
9
+ from app.utils.llm_client import client
10
+ from app.services.vector_service import store_manager
11
+ from app.services.github_service import get_file_content
12
+ from app.services.chunking_service import UniversalChunker, ChunkingConfig
13
+ from app.services.tracing_service import tracing_service
14
+ from app.utils.session import get_conversation_memory, ConversationMemory
15
+
16
+
17
+ # ============================================================
18
+ # 配置类 - 解耦所有可调参数
19
+ # ============================================================
20
+
21
+ @dataclass
22
+ class ChatConfig:
23
+ """Chat 服务配置 - 集中管理所有参数"""
24
+ # JIT 动态加载配置
25
+ max_jit_rounds: int = 2 # 最大 JIT 轮数
26
+ max_files_per_round: int = 3 # 每轮最多加载文件数
27
+
28
+ # LLM 配置
29
+ temperature_thinking: float = 0.1 # 思考阶段温度
30
+ temperature_final: float = 0.2 # 最终回答温度
31
+ max_tokens: int = 4096 # 最大 token 数
32
+
33
+ # 检索配置
34
+ retrieval_top_k: int = 6 # RAG 检索 top-k
35
+ context_max_chars: int = 2000 # 单文档最大字符数
36
+
37
+ # 对话上下文配置
38
+ max_history_turns: int = 6 # 保留最近 N 轮对话
39
+ summary_threshold: int = 10 # 超过 N 轮开始压缩
40
+
41
+ # 调试配置
42
+ show_debug_info: bool = False # 是否显示调试信息
43
+
44
+
45
+ # 全局配置实例
46
+ chat_config = ChatConfig()
47
+
48
+
49
+ @dataclass
50
+ class ChatResult:
51
+ """聊天结果 - 用于后续自动评估"""
52
+ answer: str # 最终回答
53
+ retrieved_context: str # 检索到的上下文
54
+ generation_latency_ms: float # 生成耗时
55
+ retrieval_latency_ms: float = 0 # 检索耗时
56
+
57
+
58
+ # === 评估数据存储 (供 main.py 获取) ===
59
+ # 存储每个 session 的评估数据,key 为 session_id
60
+ _eval_data_store: Dict[str, ChatResult] = {}
61
+
62
+ def get_eval_data(session_id: str) -> Optional[ChatResult]:
63
+ """获取指定 session 的评估数据"""
64
+ return _eval_data_store.get(session_id)
65
+
66
+ def clear_eval_data(session_id: str) -> None:
67
+ """清除指定 session 的评估数据"""
68
+ if session_id in _eval_data_store:
69
+ del _eval_data_store[session_id]
70
+
71
+
72
+ # [Fix 2] 使用 Config 对象初始化,而非直接传参
73
+ # 之前的写法: chunker = UniversalChunker(min_chunk_size=100)
74
+ # 现在的写法:
75
+ chunker = UniversalChunker(config=ChunkingConfig(min_chunk_size=100))
76
+
77
+ # === 新增:简单的中文检测 ===
78
+ def is_chinese_query(text: str) -> bool:
79
+ """检测字符串中是否包含中文字符"""
80
+ for char in text:
81
+ if '\u4e00' <= char <= '\u9fff':
82
+ return True
83
+ return False
84
+
85
+ # === 优化 2:查询重写 (解决中英文检索不匹配问题) ===
86
+ async def _rewrite_query(user_query: str):
87
+ """
88
+ 使用 LLM 将用户的自然语言(可能是中文)转换为 3-5 个代码搜索关键词(英文)。
89
+ """
90
+ prompt = f"""
91
+ You are a Code Search Expert.
92
+ Task: Convert the user's query into 3-5 English keywords for code search (BM25/Vector).
93
+
94
+ User Query: "{user_query}"
95
+
96
+ Rules:
97
+ 1. Output ONLY a JSON list of strings.
98
+ 2. Translate concepts to technical terms (e.g., "鉴权" -> "auth", "login", "middleware").
99
+ 3. Keep it short.
100
+
101
+ Example Output: ["authentication", "login_handler", "jwt_verify"]
102
+ """
103
+ try:
104
+ response = await client.chat.completions.create(
105
+ model=settings.default_model_name,
106
+ messages=[{"role": "user", "content": prompt}],
107
+ temperature=0.1,
108
+ max_tokens=100
109
+ )
110
+ content = response.choices[0].message.content
111
+ # 简单清洗
112
+ content = re.sub(r"^```(json)?|```$", "", content.strip(), flags=re.MULTILINE).strip()
113
+ keywords = json.loads(content)
114
+ if isinstance(keywords, list):
115
+ return " ".join(keywords) # 返回空格分隔的字符串供 BM25 使用
116
+ return user_query
117
+ except Exception as e:
118
+ print(f"⚠️ Query Rewrite Failed: {e}")
119
+ return user_query # 降级:直接用原句
120
+
121
+ async def process_chat_stream(user_query: str, session_id: str):
122
+ """
123
+ 处理聊天流 - 支持多轮 JIT 动态加载文件 + 对话上下文记忆
124
+
125
+ 流程:
126
+ 1. 获取对话记忆,构建上下文
127
+ 2. 初始检索 RAG 上下文
128
+ 3. LLM 思考并回答,可能请求文件
129
+ 4. 如果请求文件,加载后继续对话 (最多 max_jit_rounds 轮)
130
+ 5. 最终生成答案并保存到对话记忆
131
+ """
132
+ vector_db = store_manager.get_store(session_id)
133
+ cfg = chat_config # 使用全局配置
134
+
135
+ # === 获取对话记忆 ===
136
+ memory = get_conversation_memory(session_id)
137
+ memory.add_user_message(user_query) # 立即记录用户消息
138
+
139
+ # 检查是否需要摘要压缩
140
+ if memory.needs_summarization():
141
+ yield "> 📝 *Compressing conversation history...*\n\n"
142
+ await _compress_conversation_history(memory)
143
+
144
+ # === 评估数据收集变量 ===
145
+ collected_context = ""
146
+ collected_response = ""
147
+ collected_retrieval_latency = 0.0
148
+ collected_generation_latency = 0.0
149
+
150
+ # === JIT 状态跟踪 ===
151
+ all_loaded_files: Set[str] = set() # 所有已加载的文件
152
+ all_failed_files: Set[str] = set() # 所有失败的文件
153
+ jit_round = 0 # 当前 JIT 轮数
154
+
155
+ # === 语言环境检测 ===
156
+ use_chinese = is_chinese_query(user_query)
157
+
158
+ # UI 提示语
159
+ ui_msgs = _get_ui_messages(use_chinese)
160
+
161
+ # === 步骤 0: 查询重写 ===
162
+ search_query = await _rewrite_query(user_query)
163
+ yield f"{ui_msgs['thinking']}`{search_query}`...\n\n"
164
+
165
+ # === 步骤 1: 初始 RAG 检索 ===
166
+ retrieval_start = time.time()
167
+ relevant_docs = await vector_db.search_hybrid(search_query, top_k=cfg.retrieval_top_k)
168
+ retrieval_latency_ms = (time.time() - retrieval_start) * 1000
169
+ collected_retrieval_latency = retrieval_latency_ms
170
+ tracing_service.add_event("retrieval_completed", {
171
+ "latency_ms": retrieval_latency_ms,
172
+ "documents_retrieved": len(relevant_docs) if relevant_docs else 0
173
+ })
174
+
175
+ rag_context = _build_context(relevant_docs, cfg.context_max_chars)
176
+ collected_context = rag_context
177
+
178
+ # === 步骤 2: 构建初始 Prompt ===
179
+ global_context = vector_db.global_context or {}
180
+ file_tree = global_context.get("file_tree", "(File tree not available.)")
181
+ agent_summary = global_context.get("summary", "")
182
+
183
+ # 获取对话历史上下文
184
+ conversation_context = _build_conversation_context(memory)
185
+
186
+ system_instruction = _build_system_prompt(
187
+ file_tree=file_tree,
188
+ agent_summary=agent_summary,
189
+ rag_context=rag_context,
190
+ use_chinese=use_chinese,
191
+ is_final_round=False,
192
+ conversation_context=conversation_context
193
+ )
194
+
195
+ augmented_user_query = f"""
196
+ {user_query}
197
+
198
+ (System Note: Priority 1: Answer using context. Priority 2: Use <tool_code> ONLY if critical info is missing.)
199
+ """
200
+
201
+ if not client:
202
+ yield "❌ LLM Error: Client not initialized"
203
+ return
204
+
205
+ # 初始化对话历史
206
+ messages = [
207
+ {"role": "system", "content": system_instruction},
208
+ {"role": "user", "content": augmented_user_query}
209
+ ]
210
+
211
+ try:
212
+ generation_start = time.time()
213
+
214
+ # === 多轮 JIT 循环 ===
215
+ while jit_round <= cfg.max_jit_rounds:
216
+ is_final_round = (jit_round == cfg.max_jit_rounds)
217
+
218
+ # 如果是最终轮,更新系统提示禁用工具
219
+ if is_final_round and jit_round > 0:
220
+ # 更新系统消息,告知这是最后一轮
221
+ messages[0] = {"role": "system", "content": _build_system_prompt(
222
+ file_tree=file_tree,
223
+ agent_summary=agent_summary,
224
+ rag_context=collected_context,
225
+ use_chinese=use_chinese,
226
+ is_final_round=True,
227
+ failed_files=list(all_failed_files)
228
+ )}
229
+
230
+ # LLM 流式生成
231
+ stream = await client.chat.completions.create(
232
+ model=settings.default_model_name,
233
+ messages=messages,
234
+ stream=True,
235
+ temperature=cfg.temperature_final if is_final_round else cfg.temperature_thinking,
236
+ max_tokens=cfg.max_tokens
237
+ )
238
+
239
+ buffer = ""
240
+ round_response = ""
241
+ requested_files: Set[str] = set()
242
+
243
+ async for chunk in stream:
244
+ content = chunk.choices[0].delta.content or ""
245
+ if not content:
246
+ continue
247
+
248
+ buffer += content
249
+ round_response += content
250
+ collected_response += content
251
+
252
+ # 检测 tool_code 标签
253
+ if "</tool_code>" in buffer:
254
+ matches = re.findall(r"<tool_code>\s*(.*?)\s*</tool_code>", buffer, re.DOTALL)
255
+ for f in matches:
256
+ clean_f = f.strip().replace("'", "").replace('"', "").replace("`", "")
257
+ # 过滤已加载和已失败的文件
258
+ if clean_f and clean_f not in all_loaded_files and clean_f not in all_failed_files:
259
+ requested_files.add(clean_f)
260
+ yield content
261
+ buffer = ""
262
+ else:
263
+ yield content
264
+
265
+ # 处理缓冲区残留
266
+ if "</tool_code>" in buffer:
267
+ matches = re.findall(r"<tool_code>\s*(.*?)\s*</tool_code>", buffer, re.DOTALL)
268
+ for f in matches:
269
+ clean_f = f.strip().replace("'", "").replace('"', "").replace("`", "")
270
+ if clean_f and clean_f not in all_loaded_files and clean_f not in all_failed_files:
271
+ requested_files.add(clean_f)
272
+
273
+ # === 判断是否需要继续 JIT ===
274
+ if not requested_files or is_final_round:
275
+ # 没有新文件请求,或已达最大轮数,结束循环
276
+ break
277
+
278
+ # === JIT 文件加载 ===
279
+ jit_round += 1
280
+
281
+ # 限制每轮文件数
282
+ files_to_load = list(requested_files)[:cfg.max_files_per_round]
283
+ file_list_str = ", ".join([f"`{f}`" for f in files_to_load])
284
+
285
+ yield f"\n\n> 🔍 **[JIT Round {jit_round}/{cfg.max_jit_rounds}]** {ui_msgs['action_short']}{file_list_str}...\n\n"
286
+
287
+ if not vector_db.repo_url:
288
+ yield ui_msgs['error_url']
289
+ break
290
+
291
+ # 加载文件
292
+ round_loaded_docs = []
293
+ round_failed_files = []
294
+
295
+ for file_path in files_to_load:
296
+ if file_path in vector_db.indexed_files:
297
+ docs = vector_db.get_documents_by_file(file_path)
298
+ round_loaded_docs.extend(docs)
299
+ all_loaded_files.add(file_path)
300
+ yield f"> ✅ Loaded: `{file_path}`\n"
301
+ else:
302
+ success = await _download_and_index(vector_db, file_path)
303
+ if success:
304
+ docs = vector_db.get_documents_by_file(file_path)
305
+ round_loaded_docs.extend(docs)
306
+ all_loaded_files.add(file_path)
307
+ yield f"> ✅ Downloaded: `{file_path}`\n"
308
+ else:
309
+ round_failed_files.append(file_path)
310
+ all_failed_files.add(file_path)
311
+ yield f"> ⚠️ Failed: `{file_path}`\n"
312
+
313
+ # 构建后续消息
314
+ if round_loaded_docs:
315
+ new_context = _build_context(round_loaded_docs, cfg.context_max_chars)
316
+ collected_context += f"\n\n[JIT Round {jit_round} Context]\n{new_context}"
317
+
318
+ # 构建状态消息
319
+ status_msg = _build_jit_status_message(
320
+ loaded_count=len(round_loaded_docs),
321
+ failed_files=round_failed_files,
322
+ remaining_rounds=cfg.max_jit_rounds - jit_round,
323
+ use_chinese=use_chinese
324
+ )
325
+
326
+ context_section = f"\n\n[New Code Context]\n{_build_context(round_loaded_docs, cfg.context_max_chars)}" if round_loaded_docs else ""
327
+
328
+ # 更新对话历史,继续对话
329
+ messages.append({"role": "assistant", "content": round_response})
330
+ messages.append({"role": "user", "content": f"{status_msg}{context_section}\n\nPlease continue your analysis."})
331
+
332
+ yield "\n\n" # 分隔符
333
+
334
+ # === 生成完成 ===
335
+ generation_latency_ms = (time.time() - generation_start) * 1000
336
+ collected_generation_latency = generation_latency_ms
337
+
338
+ tracing_service.add_event("generation_completed", {
339
+ "latency_ms": generation_latency_ms,
340
+ "jit_rounds": jit_round,
341
+ "files_loaded": len(all_loaded_files),
342
+ "files_failed": len(all_failed_files)
343
+ })
344
+
345
+ # === 保存助手回复到对话记忆 ===
346
+ memory.add_assistant_message(collected_response)
347
+
348
+ # 存储评估数据
349
+ _eval_data_store[session_id] = ChatResult(
350
+ answer=collected_response,
351
+ retrieved_context=collected_context,
352
+ generation_latency_ms=collected_generation_latency,
353
+ retrieval_latency_ms=collected_retrieval_latency
354
+ )
355
+ print(f"📦 [EvalData] Session {session_id}: {len(collected_context)} chars context, {len(collected_response)} chars answer, {jit_round} JIT rounds, {memory.get_turn_count()} turns")
356
+
357
+ except Exception as e:
358
+ import traceback
359
+ traceback.print_exc()
360
+ error_msg = str(e)
361
+ # 即使出错也保存部分回复
362
+ if collected_response:
363
+ memory.add_assistant_message(collected_response + f"\n\n[Error: {error_msg}]")
364
+ tracing_service.add_event("generation_error", {
365
+ "error": error_msg,
366
+ "error_type": type(e).__name__,
367
+ "jit_round": jit_round
368
+ })
369
+ yield f"\n\n❌ System Error: {error_msg}"
370
+
371
+
372
+ # ============================================================
373
+ # 辅助函数
374
+ # ============================================================
375
+
376
+ def _get_ui_messages(use_chinese: bool) -> Dict[str, str]:
377
+ """获取 UI 消息(根据语言)"""
378
+ if use_chinese:
379
+ return {
380
+ "thinking": "> 🧠 **思考中:** 正在检索相关代码: ",
381
+ "action_short": "正在读取文件: ",
382
+ "error_url": "> ⚠️ 错误: 仓库链接丢失。\n",
383
+ }
384
+ else:
385
+ return {
386
+ "thinking": "> 🧠 **Thinking:** Searching for code related to: ",
387
+ "action_short": "Retrieving files: ",
388
+ "error_url": "> ⚠️ Error: Repository URL lost.\n",
389
+ }
390
+
391
+
392
+ def _build_system_prompt(
393
+ file_tree: str,
394
+ agent_summary: str,
395
+ rag_context: str,
396
+ use_chinese: bool,
397
+ is_final_round: bool,
398
+ failed_files: List[str] = None,
399
+ conversation_context: str = ""
400
+ ) -> str:
401
+ """构建系统提示词"""
402
+ lang_instruction = (
403
+ "IMPORTANT: The user is asking in Chinese. You MUST reply in Simplified Chinese (简体中文)."
404
+ if use_chinese else "Reply in English."
405
+ )
406
+
407
+ if is_final_round:
408
+ tool_instruction = """
409
+ [INSTRUCTIONS - FINAL ROUND]
410
+ This is your FINAL response. You MUST provide a complete answer NOW.
411
+ - DO NOT request any more files
412
+ - DO NOT use <tool_code> tags
413
+ - Synthesize all available context and give your best answer
414
+ - If some files were not accessible, explain what information is missing and provide the best possible answer with what you have
415
+ """
416
+ if failed_files:
417
+ tool_instruction += f"\n Note: The following files could not be accessed: {', '.join(failed_files)}"
418
+ else:
419
+ tool_instruction = """
420
+ [INSTRUCTIONS]
421
+ 1. **CHECK CONTEXT FIRST**: Look at the [Current Code Context]. Does it contain the answer?
422
+ 2. **IF YES**: Answer directly. DO NOT use tools.
423
+ 3. **IF NO**: Request missing files using tags: <tool_code>path/to/file</tool_code>
424
+ """
425
+
426
+ # 添加对话历史上下文
427
+ conversation_section = ""
428
+ if conversation_context:
429
+ conversation_section = f"""
430
+ [Previous Conversation]
431
+ {conversation_context}
432
+ """
433
+
434
+ return f"""
435
+ You are a Senior GitHub Repository Analyst.
436
+ {lang_instruction}
437
+
438
+ [Global Context - Repo Map]
439
+ {file_tree}
440
+
441
+ [Agent Analysis Summary]
442
+ {agent_summary}
443
+ {conversation_section}
444
+ [Current Code Context (Retrieved)]
445
+ {rag_context}
446
+ {tool_instruction}
447
+ """
448
+
449
+
450
+ def _build_conversation_context(memory: ConversationMemory) -> str:
451
+ """
452
+ 构建对话历史上下文字符串
453
+
454
+ 只包含最近几轮对话的摘要,用于 system prompt
455
+ """
456
+ messages = memory.get_context_messages()
457
+
458
+ if len(messages) <= 2:
459
+ # 只有当前轮,不需要历史
460
+ return ""
461
+
462
+ # 排除最后一条(当前用户消息)
463
+ history_messages = messages[:-1]
464
+
465
+ if not history_messages:
466
+ return ""
467
+
468
+ context_parts = []
469
+ for msg in history_messages[-6:]: # 最多 6 条(3 轮)
470
+ role = "User" if msg["role"] == "user" else "Assistant"
471
+ # 截断过长的内容
472
+ content = msg["content"][:500]
473
+ if len(msg["content"]) > 500:
474
+ content += "..."
475
+ context_parts.append(f"{role}: {content}")
476
+
477
+ return "\n".join(context_parts)
478
+
479
+
480
+ async def _compress_conversation_history(memory: ConversationMemory) -> None:
481
+ """
482
+ 压缩对话历史 - 使用 LLM 生成摘要
483
+ """
484
+ messages_to_summarize = memory.get_messages_to_summarize()
485
+
486
+ if not messages_to_summarize:
487
+ return
488
+
489
+ # 构建摘要请求
490
+ conversation_text = "\n".join([
491
+ f"{'User' if m['role'] == 'user' else 'Assistant'}: {m['content'][:300]}"
492
+ for m in messages_to_summarize
493
+ ])
494
+
495
+ prompt = f"""Summarize the following conversation in 2-3 sentences, focusing on:
496
+ 1. What questions were asked
497
+ 2. Key information discovered
498
+ 3. Important conclusions
499
+
500
+ Conversation:
501
+ {conversation_text}
502
+
503
+ Summary (be concise):"""
504
+
505
+ try:
506
+ response = await client.chat.completions.create(
507
+ model=settings.default_model_name,
508
+ messages=[{"role": "user", "content": prompt}],
509
+ temperature=0.3,
510
+ max_tokens=200
511
+ )
512
+ summary = response.choices[0].message.content.strip()
513
+
514
+ # 保存摘要
515
+ end_idx = len(memory._messages) - chat_config.max_history_turns * 2
516
+ memory.set_summary(summary, end_idx)
517
+
518
+ print(f"📝 Conversation compressed: {len(messages_to_summarize)} messages -> summary")
519
+ except Exception as e:
520
+ print(f"⚠️ Failed to compress conversation: {e}")
521
+
522
+
523
+ def _build_jit_status_message(
524
+ loaded_count: int,
525
+ failed_files: List[str],
526
+ remaining_rounds: int,
527
+ use_chinese: bool
528
+ ) -> str:
529
+ """构建 JIT 状态消息"""
530
+ if use_chinese:
531
+ if loaded_count > 0 and not failed_files:
532
+ return f"系统通知: 成功加载 {loaded_count} 个文件。"
533
+ elif loaded_count > 0 and failed_files:
534
+ failed_list = ", ".join(failed_files)
535
+ return f"系统通知: 加载了 {loaded_count} 个文件,但以下文件无法访问: {failed_list}。"
536
+ else:
537
+ failed_list = ", ".join(failed_files)
538
+ if remaining_rounds > 0:
539
+ return f"系统通知: 文件 ({failed_list}) 无法访问。你还有 {remaining_rounds} 次机会请求其他文件,或者基于现有上下文回答。"
540
+ else:
541
+ return f"系统通知: 文件 ({failed_list}) 无法访问。请基于现有上下文给出最佳回答。"
542
+ else:
543
+ if loaded_count > 0 and not failed_files:
544
+ return f"System Notification: Successfully loaded {loaded_count} files."
545
+ elif loaded_count > 0 and failed_files:
546
+ failed_list = ", ".join(failed_files)
547
+ return f"System Notification: Loaded {loaded_count} files, but the following could not be accessed: {failed_list}."
548
+ else:
549
+ failed_list = ", ".join(failed_files)
550
+ if remaining_rounds > 0:
551
+ return f"System Notification: Files ({failed_list}) could not be accessed. You have {remaining_rounds} more attempts to request other files, or answer based on available context."
552
+ else:
553
+ return f"System Notification: Files ({failed_list}) could not be accessed. Please provide the best possible answer based on existing context."
554
+
555
+ async def _download_and_index(vector_db, file_path):
556
+ """下载并索引文件"""
557
+ try:
558
+ content = await get_file_content(vector_db.repo_url, file_path)
559
+ if not content: return False
560
+
561
+ chunks = await asyncio.to_thread(chunker.chunk_file, content, file_path)
562
+ if not chunks:
563
+ chunks = [{
564
+ "content": content,
565
+ "metadata": {"file": file_path, "type": "text", "name": "root", "class": ""}
566
+ }]
567
+
568
+ documents = [c["content"] for c in chunks]
569
+ metadatas = []
570
+ for c in chunks:
571
+ meta = c["metadata"]
572
+ metadatas.append({
573
+ "file": meta["file"],
574
+ "type": meta["type"],
575
+ "name": meta.get("name", ""),
576
+ "class": meta.get("class") or ""
577
+ })
578
+ await vector_db.add_documents(documents, metadatas)
579
+ return True
580
+ except Exception as e:
581
+ print(f"Download Error: {e}")
582
+ return False
583
+
584
+
585
+ def _build_context(docs: List[Dict], max_chars: int = 2000) -> str:
586
+ """构建上下文字符串"""
587
+ if not docs:
588
+ return "(No relevant code snippets found yet)"
589
+
590
+ context = ""
591
+ for doc in docs:
592
+ file_info = doc.get('file', 'unknown')
593
+ metadata = doc.get('metadata', {})
594
+
595
+ if 'class' in metadata and metadata['class']:
596
+ file_info += f" (Class: {metadata['class']})"
597
+
598
+ content = doc.get('content', '')[:max_chars]
599
+ context += f"\n--- File: {file_info} ---\n{content}\n"
600
+
601
+ return context
app/services/chunking_service.py ADDED
@@ -0,0 +1,372 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ast
2
+ import re
3
+ import os
4
+ from dataclasses import dataclass
5
+
6
+ # --- 配置类 ---
7
+ @dataclass
8
+ class ChunkingConfig:
9
+ """
10
+ 统一管理切分服务的配置参数
11
+ """
12
+ min_chunk_size: int = 50 # 最小分块阈值 (chars)
13
+ max_chunk_size: int = 2000 # 最大分块阈值 (chars)
14
+ fallback_line_size: int = 100 # 兜底策略的行数 (lines)
15
+ max_context_chars: int = 500 # 允许注入到每个Chunk的上下文最大长度
16
+ # 超过此长度则不再注入,避免冗余内容撑爆 Token
17
+
18
+ class UniversalChunker:
19
+ def __init__(self, config: ChunkingConfig = None):
20
+ # 如果未传入配置,使用默认配置
21
+ self.config = config if config else ChunkingConfig()
22
+
23
+ def chunk_file(self, content: str, file_path: str):
24
+ if not content:
25
+ return []
26
+
27
+ ext = os.path.splitext(file_path)[1].lower()
28
+
29
+ if ext == '.py':
30
+ return self._chunk_python(content, file_path)
31
+
32
+ # 2. C-Style 语言优化
33
+ elif ext in ['.java', '.js', '.ts', '.jsx', '.tsx', '.go', '.cpp', '.c', '.h', '.cs', '.php', '.rs']:
34
+ return self._chunk_c_style(content, file_path)
35
+
36
+ else:
37
+ return self._fallback_chunking(content, file_path)
38
+
39
+ def _chunk_python(self, content, file_path):
40
+ """
41
+ 分级注入策略
42
+ """
43
+ chunks = []
44
+ try:
45
+ tree = ast.parse(content)
46
+ except SyntaxError:
47
+ return self._fallback_chunking(content, file_path)
48
+
49
+ import_nodes = []
50
+ other_nodes = []
51
+ function_class_chunks = []
52
+
53
+ # A. 遍历与分类
54
+ for node in tree.body:
55
+ if isinstance(node, ast.ClassDef):
56
+ class_code = ast.get_source_segment(content, node)
57
+ if not class_code: continue
58
+ if len(class_code) <= self.config.max_chunk_size:
59
+ function_class_chunks.append(self._create_chunk(
60
+ class_code, file_path, "class", node.name, node.lineno, node.name
61
+ ))
62
+ else:
63
+ # function_class_chunks 包含了从大类中拆分出的方法
64
+ function_class_chunks.extend(
65
+ self._chunk_large_python_class(node, content, file_path)
66
+ )
67
+
68
+ elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
69
+ func_code = ast.get_source_segment(content, node)
70
+ if func_code and len(func_code) >= self.config.min_chunk_size:
71
+ function_class_chunks.append(self._create_chunk(
72
+ func_code, file_path, "function", node.name, node.lineno
73
+ ))
74
+
75
+ else:
76
+ segment = ast.get_source_segment(content, node)
77
+ if segment and len(segment.strip()) > 0:
78
+ if isinstance(node, (ast.Import, ast.ImportFrom)):
79
+ import_nodes.append(segment)
80
+ else:
81
+ other_nodes.append(segment)
82
+
83
+ # B. 决策准备
84
+ has_core_code = len(function_class_chunks) > 0
85
+ others_text = "\n".join(other_nodes).strip()
86
+ should_inject_others = len(others_text) <= self.config.max_context_chars
87
+
88
+ # C. 构建 Context Header
89
+ context_parts = []
90
+ # 1. Import 永远注入
91
+ if import_nodes:
92
+ context_parts.append("\n".join(import_nodes))
93
+ # 2. Globals 按需注入
94
+ if others_text and should_inject_others:
95
+ context_parts.append(others_text)
96
+
97
+ full_header = "\n".join(context_parts).strip()
98
+ if full_header:
99
+ full_header = f"# --- Context ---\n{full_header}\n# ---------------\n"
100
+
101
+ # D. 注入 Header 到核心 Chunk (函数/类)
102
+ # 此时 function_class_chunks 已经包含了大类拆分出来的方法
103
+ # 这里的循环会给它们都加上 Import/Global Context
104
+ for chunk in function_class_chunks:
105
+ chunk["content"] = full_header + chunk["content"]
106
+
107
+ # E. 处理溢出 (仅当有核心代码时,才独立存储溢出的 Globals)
108
+ if has_core_code and others_text and not should_inject_others:
109
+ chunks.append(self._create_chunk(
110
+ others_text, file_path, "global_context", "globals", 1
111
+ ))
112
+
113
+ # F. 纯脚本兜底
114
+ if not has_core_code:
115
+ # 这是一个纯脚本文件 (只有 Import 和 顶层逻辑)
116
+ full_script = (("\n".join(import_nodes) + "\n") if import_nodes else "") + others_text
117
+ if full_script.strip():
118
+ # 如果脚本太长,不要硬切成一个大块,而是走 Fallback 按行切分
119
+ if len(full_script) > self.config.max_chunk_size * 1.5: # 1.5倍宽容度
120
+ return self._fallback_chunking(content, file_path)
121
+ else:
122
+ chunks.append(self._create_chunk(
123
+ full_script, file_path, "script", "main", 1
124
+ ))
125
+
126
+ chunks.extend(function_class_chunks)
127
+
128
+ if not chunks and len(content.strip()) > 0:
129
+ return self._fallback_chunking(content, file_path)
130
+
131
+ return chunks
132
+
133
+ def _chunk_large_python_class(self, class_node, content, file_path):
134
+ chunks = []
135
+ class_name = class_node.name
136
+ docstring = ast.get_docstring(class_node) or ""
137
+
138
+ # === 尝试收集类级别的变量定义 ===
139
+ class_vars = []
140
+ for node in class_node.body:
141
+ # 如果是赋值语句,且在方法定义之前 (通常 AST 是有序的)
142
+ if isinstance(node, (ast.Assign, ast.AnnAssign)):
143
+ seg = ast.get_source_segment(content, node)
144
+ if seg: class_vars.append(seg)
145
+ # 一旦遇到函数,就停止收集变量,避免把乱七八糟的逻辑也收进去
146
+ elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
147
+ break
148
+
149
+ vars_text = "\n ".join(class_vars)
150
+ if vars_text:
151
+ vars_text = "\n " + vars_text # 缩进对齐
152
+
153
+ # 将变量拼接到 Header 中
154
+ context_header = f"class {class_name}:{vars_text}\n \"\"\"{docstring}\"\"\"\n # ... (Parent Context)\n"
155
+
156
+ for node in class_node.body:
157
+ if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
158
+ method_code = ast.get_source_segment(content, node)
159
+ if not method_code: continue
160
+
161
+ full_chunk_content = context_header + "\n" + method_code
162
+ chunks.append(self._create_chunk(
163
+ full_chunk_content, file_path, "method", node.name, node.lineno, class_name
164
+ ))
165
+ return chunks
166
+
167
+ def _chunk_c_style(self, content, file_path):
168
+ """
169
+ 解决宏干扰、全局变量丢失、跨行函数头问题
170
+ """
171
+ chunks = []
172
+ if not content: return []
173
+
174
+ # === 1. 定义正则 Token ===
175
+ # 使用 Named Groups 避免 startswith 的模糊匹配
176
+ # 顺序至关重要:长匹配优先
177
+ token_pattern = re.compile(
178
+ r'(?P<BLOCK_COMMENT>/\*.*?\*/)|' # 块注释
179
+ r'(?P<LINE_COMMENT>//[^\n]*)|' # 行注释
180
+ r'(?P<STRING>"(?:\\.|[^"\\])*")|' # 双引号字符串
181
+ r'(?P<CHAR>\'(?:\\.|[^\'\\])*\')|' # 单引号字符
182
+ r'(?P<TEMPLATE>`(?:\\.|[^`\\])*`)|' # 反引号模板 (JS/Go)
183
+ r'(?P<MACRO>^\s*#.*(?:\\\n.*)*)|' # 宏定义 (支持跨行)
184
+ r'(?P<BRACE_OPEN>\{)|' # 开括号
185
+ r'(?P<BRACE_CLOSE>\})|' # 闭括号
186
+ r'(?P<SEMICOLON>;)', # 分号 (用于分割全局变量和函数头)
187
+ re.DOTALL | re.MULTILINE
188
+ )
189
+
190
+ # 全局上下文收集器
191
+ global_context_parts = []
192
+
193
+ last_index = 0 # 上一个 Token 结束位置
194
+ block_start_index = 0 # 当前 Block (函数/类) 的签名开始位置
195
+
196
+ brace_balance = 0
197
+ in_structural_block = False # 是否在最外层的类/函数块内
198
+
199
+ # 暂存当前块的前置文本 (从上一个块结束 到 当前块开始)
200
+ # 这段文本里可能混杂着:全局变量、Import、以及当前函数的签名
201
+ pending_pre_text_start = 0
202
+
203
+ # 扫描
204
+ for match in token_pattern.finditer(content):
205
+ kind = match.lastgroup
206
+ start, end = match.span()
207
+
208
+ # 跳过非结构化 Token (注释、字符串、宏)
209
+ if kind in ('BLOCK_COMMENT', 'LINE_COMMENT', 'STRING', 'CHAR', 'TEMPLATE', 'MACRO'):
210
+ continue
211
+
212
+ # 忽略括号 () 和 [],只认 {}。
213
+ # C-style 语言只有 {} 定义 Scope Body。忽略 () [] 是为了防止 if(a[i]){...} 误判。
214
+ # 只要 regex 不匹配 () [],它们就被视为普通文本,不会影响 brace_balance。
215
+ if kind == 'BRACE_OPEN':
216
+ if brace_balance == 0:
217
+ # === 发现一个新的顶层 Block ===
218
+ in_structural_block = True
219
+
220
+ # 1. 分析 "空隙文本" (从上一个块结束 到 这个 { 之前)
221
+ gap_text = content[pending_pre_text_start:start]
222
+
223
+ # [策略] 拆分 Global Context 和 Signature
224
+ # 寻找最后一个分号 ';' 或 '}' (在 gap_text 内部的逻辑结束点)
225
+ # 倒序查找比较安全。
226
+ # 如果找不到,说明整段 gap 都是签名 (e.g. void foo() {)
227
+ # 如果找到,分号前是 Global,分号后是 Signature
228
+ split_idx = gap_text.rfind(';')
229
+ if split_idx != -1:
230
+ # 分号前:归入全局上下文
231
+ global_part = gap_text[:split_idx+1].strip()
232
+ if global_part:
233
+ global_context_parts.append(global_part)
234
+ # 分号后:是当前函数的签名
235
+ # 自动处理了跨行函数头,因为 gap_text 包含换行
236
+ block_signature_start = pending_pre_text_start + split_idx + 1
237
+ else:
238
+ # 没有分号,假设全是签名 (e.g. 紧接着上一个块,或者是文件开头)
239
+ # 但要小心 include/import 等没有分号的语句 (Python 思维在 C 里不适用,C 几乎都有分号)
240
+ # Go 语言除外 (Go 没分号)。这里做一个简单的 heuristic:
241
+ # 如果是 Go/JS/TS,可能没有分号。暂且全部视为 Signature,
242
+ # 除非它看起来像 import。
243
+ # 这是一个 trade-off。
244
+ block_signature_start = pending_pre_text_start
245
+
246
+ # 记录当前 Block 真正的“视觉开始点” (包含签名)
247
+ block_start_index = block_signature_start
248
+
249
+ brace_balance += 1
250
+
251
+ elif kind == 'BRACE_CLOSE':
252
+ brace_balance -= 1
253
+
254
+ if brace_balance == 0 and in_structural_block:
255
+ # === 顶层 Block 结束 ===
256
+ in_structural_block = False
257
+
258
+ # 提取完整代码块 (Signature + Body)
259
+ # 范围:block_start_index -> end
260
+ full_block_text = content[block_start_index:end]
261
+
262
+ # 小块合并策略
263
+ # 如果块太小 (e.g. Getter/Setter),暂不生成 Chunk
264
+ # 架构决策:为了代码完整性,工业界 RAG 通常不建议丢弃小块,
265
+ # 尤其是 Getter/Setter 可能包含关键字段名。
266
+ # 这里我们生成 Chunk,但后续入库时可以由 Embedding 模型决定权重。
267
+
268
+ # 提取元数据
269
+ meta = self._extract_c_style_metadata(full_block_text)
270
+ start_line = content.count('\n', 0, block_start_index) + 1
271
+
272
+ chunks.append(self._create_chunk(
273
+ full_block_text, # 暂时不加 Global Header,最后统一加
274
+ file_path, meta["type"], meta["name"], start_line
275
+ ))
276
+
277
+ # 更新游标:下一个块的前置文本从这里开始
278
+ pending_pre_text_start = end
279
+
280
+ # === 循环结束后的收尾 ===
281
+ # 处理文件末尾的剩余文本 (Tail)
282
+ tail_text = content[pending_pre_text_start:].strip()
283
+ if tail_text:
284
+ global_context_parts.append(tail_text)
285
+
286
+ # === Global Context 重排序 ===
287
+ # 目标顺序: Includes > Macros (#define) > Others (Typedefs/Vars)
288
+ # 简单策略:基于字符串内容的优先级排序
289
+
290
+ def context_priority(text):
291
+ text = text.strip()
292
+ if text.startswith("#include") or text.startswith("import") or text.startswith("using"):
293
+ return 0 # 最高优先级
294
+ if text.startswith("#define") or text.startswith("#macro"):
295
+ return 1 # 宏定义
296
+ if text.startswith("typedef") or text.startswith("enum") or text.startswith("struct"):
297
+ return 2 # 类型定义
298
+ return 3 # 普通全局变量和其他
299
+
300
+ # 稳定排序
301
+ global_context_parts.sort(key=context_priority)
302
+
303
+ # === 组装与注入 ===
304
+ full_global_context = "\n".join(global_context_parts).strip()
305
+
306
+ should_inject = len(full_global_context) <= self.config.max_context_chars
307
+
308
+ context_header = ""
309
+ if full_global_context and should_inject:
310
+ context_header = f"/* --- Global Context --- */\n{full_global_context}\n/* ---------------------- */\n"
311
+
312
+ for chunk in chunks:
313
+ chunk["content"] = context_header + chunk["content"]
314
+
315
+ if (full_global_context and not should_inject) or (not chunks and full_global_context):
316
+ chunks.insert(0, self._create_chunk(
317
+ full_global_context, file_path, "global_context", "header", 1
318
+ ))
319
+
320
+ if not chunks:
321
+ return self._fallback_chunking(content, file_path)
322
+
323
+ return chunks
324
+
325
+ def _extract_c_style_metadata(self, code_block):
326
+ """
327
+ 从包含签名的代码块中提取元数据 (支持多行签名)
328
+ """
329
+ # 截取到第一个 { 为止
330
+ header_part = code_block.split('{')[0]
331
+ # 压缩多余空白,变成单行以便正则匹配
332
+ header_clean = " ".join(header_part.split())
333
+
334
+ # 1. Class/Struct/Interface
335
+ type_pattern = re.compile(r'\b(class|struct|interface|enum|record|type)\s+([a-zA-Z0-9_]+)')
336
+ match = type_pattern.search(header_clean)
337
+ if match:
338
+ return {"type": "class", "name": match.group(2)}
339
+
340
+ # 2. Function
341
+ # 匹配: 单词 + (
342
+ # 排除关键字: if, for, while, switch, catch, return
343
+ func_pattern = re.compile(r'\b([a-zA-Z0-9_]+)\s*\(')
344
+ for match in func_pattern.finditer(header_clean):
345
+ name = match.group(1)
346
+ if name not in {'if', 'for', 'while', 'switch', 'catch', 'return', 'sizeof'}:
347
+ return {"type": "function", "name": name}
348
+
349
+ return {"type": "code_block", "name": "anonymous"}
350
+
351
+ def _fallback_chunking(self, content, file_path):
352
+ """兜底策略:使用 Config 中的行数设置"""
353
+ chunks = []
354
+ lines = content.split('\n')
355
+ chunk_size = self.config.fallback_line_size
356
+
357
+ for i in range(0, len(lines), chunk_size):
358
+ chunk_content = "\n".join(lines[i:i+chunk_size])
359
+ chunks.append(self._create_chunk(chunk_content, file_path, "text_chunk", f"chunk_{i}", i+1))
360
+ return chunks
361
+
362
+ def _create_chunk(self, content, file_path, type_, name, start_line, class_name=""):
363
+ return {
364
+ "content": content,
365
+ "metadata": {
366
+ "file": file_path,
367
+ "type": type_,
368
+ "name": name,
369
+ "start_line": start_line,
370
+ "class": class_name
371
+ }
372
+ }
app/services/github_service.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ GitHub 服务层
4
+
5
+ 职责:
6
+ - 提供业务级别的 GitHub 操作
7
+ - 封装底层客户端,提供简洁 API
8
+ - 保持向后兼容的函数签名
9
+ """
10
+
11
+ import logging
12
+ from typing import List, Optional, Dict
13
+
14
+ from app.utils.github_client import (
15
+ GitHubClient,
16
+ GitHubRepo,
17
+ GitHubFile,
18
+ FileFilter,
19
+ GitHubError,
20
+ GitHubNotFoundError,
21
+ get_github_client,
22
+ parse_repo_url,
23
+ )
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
+ # ============================================================
29
+ # 服务类
30
+ # ============================================================
31
+
32
class GitHubService:
    """High-level GitHub operations on top of the async GitHubClient.

    Example:
        service = GitHubService()

        # list repository files
        files = await service.get_repo_structure("https://github.com/owner/repo")

        # single file content
        content = await service.get_file_content(repo_url, "src/main.py")

        # batch fetch (concurrent)
        contents = await service.get_files_content(repo_url, ["README.md", "src/main.py"])
    """

    def __init__(self, client: Optional[GitHubClient] = None):
        # A client may be injected (e.g. for tests); otherwise it is
        # created lazily via the shared factory on first use.
        self._client = client

    @property
    def client(self) -> GitHubClient:
        """The underlying client (lazily initialized)."""
        if self._client is None:
            self._client = get_github_client()
        return self._client

    async def _get_repo_from_url(self, repo_url: str) -> GitHubRepo:
        """Resolve a repository URL into a GitHubRepo object.

        Raises:
            ValueError: when the URL cannot be parsed as a GitHub repo URL.
        """
        parsed = parse_repo_url(repo_url)
        if not parsed:
            raise ValueError(f"无效的 GitHub URL: {repo_url}")
        owner, name = parsed
        return await self.client.get_repo(owner, name)

    async def get_repo_structure(
        self,
        repo_url: str,
        file_filter: Optional[FileFilter] = None
    ) -> List[str]:
        """Return the repository's file paths, optionally filtered.

        Args:
            repo_url: GitHub repository URL.
            file_filter: optional custom file filter.

        Returns:
            List of file paths.
        """
        repo = await self._get_repo_from_url(repo_url)
        tree = await self.client.get_repo_tree(repo, file_filter)
        return [entry.path for entry in tree]

    async def get_file_content(
        self,
        repo_url: str,
        file_path: str
    ) -> Optional[str]:
        """Return one file's content, or None on failure."""
        repo = await self._get_repo_from_url(repo_url)
        return await self.client.get_file_content(repo, file_path)

    async def get_files_content(
        self,
        repo_url: str,
        file_paths: List[str]
    ) -> Dict[str, Optional[str]]:
        """Fetch several files concurrently.

        Returns:
            Mapping of path -> content (None for files that failed).
        """
        repo = await self._get_repo_from_url(repo_url)
        return await self.client.get_files_content(repo, file_paths, show_progress=True)

    async def get_repo_info(self, repo_url: str) -> GitHubRepo:
        """Return basic repository information as a GitHubRepo object."""
        return await self._get_repo_from_url(repo_url)
144
+
145
+
146
# ============================================================
# Module-level singleton
# ============================================================

_github_service: Optional[GitHubService] = None


def get_github_service() -> GitHubService:
    """Return the process-wide GitHubService, creating it on first call."""
    global _github_service
    if _github_service is None:
        _github_service = GitHubService()
    return _github_service
159
+
160
+
161
# ============================================================
# Backward-compatible module-level API (sync-style signatures
# that return coroutines)
# ============================================================

def parse_repo_url_compat(url: str) -> Optional[str]:
    """Parse a GitHub URL into an "owner/repo" string (legacy signature).

    Returns:
        "owner/repo" on success, None for invalid URLs.
    """
    parsed = parse_repo_url(url)
    return f"{parsed[0]}/{parsed[1]}" if parsed else None
177
+
178
+
179
async def get_repo_structure(repo_url: str) -> List[str]:
    """List repository file paths (legacy wrapper).

    Note: this is an async function and must be awaited.
    """
    return await get_github_service().get_repo_structure(repo_url)
187
+
188
+
189
async def get_file_content(repo_url: str, file_path: str) -> Optional[str]:
    """Fetch a single file's content (legacy wrapper).

    Note: this is an async function and must be awaited.
    """
    return await get_github_service().get_file_content(repo_url, file_path)
197
+
198
+
199
# Public API surface of this module.
__all__ = [
    "GitHubService",
    "get_github_service",
    "get_repo_structure",
    "get_file_content",
    "parse_repo_url_compat",
    "GitHubError",
    "GitHubNotFoundError",
    "FileFilter",
    "GitHubRepo",
]
app/services/tracing_service.py ADDED
@@ -0,0 +1,549 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 文件路径: app/services/tracing_service.py
2
+ """
3
+ Langfuse集成模块 - 用于端到端追踪和观测
4
+
5
+ 核心能力:
6
+ 1. 自动捕获每一步的延迟、Token成本、输入输出
7
+ 2. 记录完整的调用链路: Query -> Rewrite -> Retrieval -> Generation
8
+ 3. 记录Tool调用和参数
9
+ 4. 集成到评估流程
10
+
11
+ Langfuse支持:
12
+ - 本地部署 (docker run ... langfuse)
13
+ - 云端托管 (app.langfuse.com)
14
+
15
+ Author: Dexter
16
+ Date: 2025-01-27
17
+ """
18
+
19
+ import time
20
+ import json
21
+ import os
22
+ from typing import Dict, Any, Optional, List, Callable
23
+ from functools import wraps
24
+ from datetime import datetime
25
+ from dataclasses import dataclass
26
+
27
+
28
# ============================================================================
# Part 1: Langfuse client initialization (optional dependency)
# ============================================================================

# Holds the import exception (if any) so callers can inspect why Langfuse
# is unavailable.
LANGFUSE_IMPORT_ERROR = None
_LANGFUSE_ENABLED_ENV = os.getenv("LANGFUSE_ENABLED", "true").strip().lower()
# Any value other than an explicit off-switch enables Langfuse.
_LANGFUSE_ENABLED = _LANGFUSE_ENABLED_ENV not in {"0", "false", "no", "off"}

if _LANGFUSE_ENABLED:
    try:
        from langfuse import Langfuse
        from langfuse.decorators import observe, langfuse_context
        LANGFUSE_AVAILABLE = True
    except Exception as e:
        # Package missing or broken: remember why and fall back to local logging.
        LANGFUSE_IMPORT_ERROR = e
        LANGFUSE_AVAILABLE = False
else:
    LANGFUSE_AVAILABLE = False
46
+
47
+
48
@dataclass
class TracingConfig:
    """Runtime configuration for the tracing backend."""
    enabled: bool = True
    backend: str = "langfuse"  # "langfuse" or "local"
    # NOTE: environment variables are read once, at class-definition time.
    langfuse_host: str = os.getenv("LANGFUSE_HOST", "http://localhost:3000")
    langfuse_public_key: str = os.getenv("LANGFUSE_PUBLIC_KEY", "")
    langfuse_secret_key: str = os.getenv("LANGFUSE_SECRET_KEY", "")
    capture_token_usage: bool = True
    capture_latency: bool = True
    # Directory for the local JSONL mirror of all trace records.
    local_log_dir: str = "logs/traces"
59
+
60
+
61
class TracingService:
    """Unified tracing facade.

    Writes spans/events to a Langfuse deployment when available, and always
    mirrors records to local JSONL files.
    """

    def __init__(self, config: TracingConfig = None):
        self.config = config or TracingConfig()
        self.langfuse_client = None
        self.current_trace_id = None

        wants_langfuse = self.config.enabled and self.config.backend == "langfuse"
        if wants_langfuse:
            if not LANGFUSE_AVAILABLE:
                # Package missing: degrade gracefully to the local backend.
                print("⚠️ Langfuse not installed. Install with: pip install langfuse. Falling back to local logging.")
                self.config.backend = "local"
            else:
                try:
                    self.langfuse_client = Langfuse(
                        host=self.config.langfuse_host,
                        public_key=self.config.langfuse_public_key,
                        secret_key=self.config.langfuse_secret_key,
                        enabled=True,
                        debug=False,
                    )
                    print("✅ Langfuse client initialized successfully")
                except Exception as e:
                    print(f"⚠️ Langfuse initialization failed: {e}. Falling back to local logging.")
                    self.config.backend = "local"

        # The local log directory is always created: even the Langfuse
        # backend mirrors records to disk.
        os.makedirs(self.config.local_log_dir, exist_ok=True)
92
+
93
+ def start_trace(self, trace_name: str, session_id: str, metadata: Dict = None) -> str:
94
+ """启动一个新的追踪链"""
95
+ import uuid
96
+ trace_id = str(uuid.uuid4())
97
+ self.current_trace_id = trace_id
98
+
99
+ if self.langfuse_client:
100
+ self.langfuse_client.trace(
101
+ name=trace_name,
102
+ input=metadata or {},
103
+ session_id=session_id
104
+ )
105
+ print(f"📍 Trace started: {trace_id}")
106
+ else:
107
+ self._log_locally("trace_start", {
108
+ "trace_id": trace_id,
109
+ "name": trace_name,
110
+ "session_id": session_id,
111
+ "metadata": metadata,
112
+ "timestamp": datetime.now().isoformat()
113
+ })
114
+
115
+ return trace_id
116
+
117
+ def record_span(
118
+ self,
119
+ span_name: str,
120
+ operation: str,
121
+ input_data: Any,
122
+ output_data: Any,
123
+ latency_ms: float,
124
+ token_usage: Dict[str, int] = None,
125
+ metadata: Dict = None
126
+ ) -> None:
127
+ """记录一个操作的跨度"""
128
+
129
+ span_record = {
130
+ "span_name": span_name,
131
+ "operation": operation,
132
+ "latency_ms": latency_ms,
133
+ "timestamp": datetime.now().isoformat(),
134
+ "token_usage": token_usage or {},
135
+ "metadata": metadata or {}
136
+ }
137
+
138
+ if self.langfuse_client:
139
+ try:
140
+ # Langfuse:记录到云端
141
+ self.langfuse_client.span(
142
+ name=span_name,
143
+ input=input_data,
144
+ output=output_data,
145
+ metadata={
146
+ "operation": operation,
147
+ "latency_ms": latency_ms,
148
+ **(token_usage or {}),
149
+ **(metadata or {})
150
+ }
151
+ )
152
+ except Exception as e:
153
+ print(f"⚠️ Failed to record span to Langfuse: {e}")
154
+
155
+ # 本地日志
156
+ self._log_locally("span", span_record)
157
+
158
+ def record_tool_call(
159
+ self,
160
+ tool_name: str,
161
+ parameters: Dict,
162
+ result: Any,
163
+ latency_ms: float,
164
+ success: bool,
165
+ error: str = None
166
+ ) -> None:
167
+ """记录工具调用"""
168
+
169
+ tool_record = {
170
+ "tool_name": tool_name,
171
+ "parameters": parameters,
172
+ "result": str(result)[:500] if result else None,
173
+ "latency_ms": latency_ms,
174
+ "success": success,
175
+ "error": error,
176
+ "timestamp": datetime.now().isoformat()
177
+ }
178
+
179
+ if self.langfuse_client:
180
+ try:
181
+ self.langfuse_client.event(
182
+ name=f"tool_call:{tool_name}",
183
+ input=parameters,
184
+ output=result,
185
+ metadata={
186
+ "latency_ms": latency_ms,
187
+ "success": success,
188
+ "error": error
189
+ }
190
+ )
191
+ except Exception as e:
192
+ print(f"⚠️ Failed to record tool call: {e}")
193
+
194
+ self._log_locally("tool_call", tool_record)
195
+
196
+ def record_retrieval_debug(
197
+ self,
198
+ query: str,
199
+ retrieved_files: List[str],
200
+ vector_scores: List[float],
201
+ bm25_scores: List[float],
202
+ latency_ms: float
203
+ ) -> None:
204
+ """记录检索过程的调试信息"""
205
+
206
+ retrieval_record = {
207
+ "query": query,
208
+ "retrieved_count": len(retrieved_files),
209
+ "files": retrieved_files,
210
+ "vector_scores": vector_scores,
211
+ "bm25_scores": bm25_scores,
212
+ "latency_ms": latency_ms,
213
+ "timestamp": datetime.now().isoformat()
214
+ }
215
+
216
+ if self.langfuse_client:
217
+ try:
218
+ self.langfuse_client.event(
219
+ name="retrieval_debug",
220
+ input={"query": query},
221
+ output={"files": retrieved_files},
222
+ metadata=retrieval_record
223
+ )
224
+ except Exception as e:
225
+ print(f"⚠️ Failed to record retrieval debug: {e}")
226
+
227
+ self._log_locally("retrieval", retrieval_record)
228
+
229
    def record_llm_generation(
        self,
        model: str,
        prompt_messages: List[Dict],
        generated_text: str,
        ttft_ms: float = None,
        total_latency_ms: float = None,
        prompt_tokens: int = None,
        completion_tokens: int = None,
        total_tokens: int = None,
        is_streaming: bool = False,
        metadata: Dict = None
    ) -> None:
        """
        Record a complete LLM generation, including token usage and TTFT.

        Args:
            model: Model name (e.g. "gpt-4", "claude-3")
            prompt_messages: Messages sent to the LLM
            generated_text: Generated text (may be truncated for logging)
            ttft_ms: Time To First Token latency in milliseconds
            total_latency_ms: Total generation latency in milliseconds
            prompt_tokens: Input token count
            completion_tokens: Output token count
            total_tokens: Total token count
            is_streaming: Whether streaming output was used
            metadata: Extra metadata
        """
        llm_record = {
            "model": model,
            "is_streaming": is_streaming,
            "prompt_preview": str(prompt_messages)[:500],  # truncated to keep logs small
            "generated_preview": generated_text[:500] if generated_text else "",
            "generated_length": len(generated_text) if generated_text else 0,
            # Token statistics
            "token_usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": total_tokens
            },
            # Latency statistics
            "latency": {
                "ttft_ms": ttft_ms,  # Time To First Token
                "total_ms": total_latency_ms,
                # Throughput is only computable when both the completion count
                # and a positive total latency are known.
                "tokens_per_second": round(completion_tokens / (total_latency_ms / 1000), 2)
                    if completion_tokens and total_latency_ms and total_latency_ms > 0 else None
            },
            "timestamp": datetime.now().isoformat(),
            "metadata": metadata or {}
        }

        if self.langfuse_client:
            try:
                self.langfuse_client.generation(
                    name="llm_generation",
                    model=model,
                    input=prompt_messages,
                    output=generated_text[:1000] if generated_text else "",
                    usage={
                        "prompt_tokens": prompt_tokens or 0,
                        "completion_tokens": completion_tokens or 0,
                        "total_tokens": total_tokens or 0
                    },
                    metadata={
                        "ttft_ms": ttft_ms,
                        "total_latency_ms": total_latency_ms,
                        "is_streaming": is_streaming,
                        **(metadata or {})
                    }
                )
            except Exception as e:
                print(f"⚠️ Failed to record LLM generation to Langfuse: {e}")

        # Local JSONL mirror is written regardless of backend.
        self._log_locally("llm_generation", llm_record)
303
+
304
+ def record_ttft(self, ttft_ms: float, model: str = None, metadata: Dict = None) -> None:
305
+ """
306
+ 单独记录 TTFT (Time To First Token)
307
+ 用于流式生成时在收到第一个 token 时立即记录
308
+
309
+ Args:
310
+ ttft_ms: 首 token 延迟(毫秒)
311
+ model: 模型名称
312
+ metadata: 额外元数据
313
+ """
314
+ ttft_record = {
315
+ "ttft_ms": ttft_ms,
316
+ "model": model,
317
+ "timestamp": datetime.now().isoformat(),
318
+ "metadata": metadata or {}
319
+ }
320
+
321
+ if self.langfuse_client:
322
+ try:
323
+ self.langfuse_client.event(
324
+ name="ttft",
325
+ input={},
326
+ output={"ttft_ms": ttft_ms},
327
+ metadata=ttft_record
328
+ )
329
+ except Exception as e:
330
+ print(f"⚠️ Failed to record TTFT: {e}")
331
+
332
+ self._log_locally("ttft", ttft_record)
333
+
334
+ def add_event(self, event_name: str, event_data: Dict[str, Any] = None) -> None:
335
+ """
336
+ 添加事件记录
337
+
338
+ Args:
339
+ event_name: 事件名称 (如 "repo_map_generated", "file_read_failed" 等)
340
+ event_data: 事件相关数据
341
+ """
342
+ event_record = {
343
+ "event_name": event_name,
344
+ "event_data": event_data or {},
345
+ "timestamp": datetime.now().isoformat()
346
+ }
347
+
348
+ if self.langfuse_client:
349
+ try:
350
+ self.langfuse_client.event(
351
+ name=event_name,
352
+ input={},
353
+ output=event_data or {},
354
+ metadata=event_data or {}
355
+ )
356
+ except Exception as e:
357
+ print(f"⚠️ Failed to record event '{event_name}': {e}")
358
+
359
+ self._log_locally("event", event_record)
360
+
361
+ def _log_locally(self, log_type: str, data: Dict) -> None:
362
+ """本地日志记录"""
363
+ log_file = os.path.join(
364
+ self.config.local_log_dir,
365
+ f"{log_type}_{datetime.now().strftime('%Y%m%d')}.jsonl"
366
+ )
367
+
368
+ with open(log_file, 'a', encoding='utf-8') as f:
369
+ f.write(json.dumps(data, ensure_ascii=False, default=str) + '\n')
370
+
371
+ def get_trace_url(self, trace_id: str = None) -> str:
372
+ """获取Langfuse中该trace的URL (用于前端跳转)"""
373
+ if not self.langfuse_client or not trace_id:
374
+ return None
375
+
376
+ # Langfuse云端URL格式
377
+ return f"{self.config.langfuse_host}/traces/{trace_id}"
378
+
379
+
380
+ # ============================================================================
381
+ # 第二部分: 装饰器 - 自动追踪
382
+ # ============================================================================
383
+
384
def traced(operation_name: str, capture_args: List[str] = None):
    """
    Decorator: automatically record a span around the wrapped function.

    Captures the requested keyword arguments, measures wall-clock latency,
    and records success or error via the module-level tracing_service.
    Works for both sync and async functions.

    Example:
        @traced("query_rewrite", capture_args=["user_query"])
        async def rewrite_query(user_query: str):
            ...

    Fixes:
    - `asyncio` is imported locally instead of relying on the module-bottom
      `import asyncio`, removing the fragile ordering dependency.
    - The duplicated success/error recording logic is factored into one
      helper shared by both wrappers.
    """
    import asyncio

    def decorator(func: Callable):
        def _capture(kwargs: dict) -> dict:
            # Only explicitly requested keyword arguments are recorded.
            if not capture_args:
                return {}
            return {k: kwargs[k] for k in capture_args if k in kwargs}

        def _record(input_data, start_time, error: Exception = None) -> None:
            # Shared span-recording path for success and failure.
            latency_ms = (time.time() - start_time) * 1000
            if error is None:
                tracing_service.record_span(
                    span_name=operation_name,
                    operation=func.__name__,
                    input_data=input_data,
                    output_data={"success": True},
                    latency_ms=latency_ms
                )
            else:
                tracing_service.record_span(
                    span_name=operation_name,
                    operation=func.__name__,
                    input_data=input_data,
                    output_data={"error": str(error)},
                    latency_ms=latency_ms,
                    metadata={"error": True}
                )

        @wraps(func)
        async def async_wrapper(*args, **kwargs):
            start_time = time.time()
            input_data = _capture(kwargs)
            try:
                result = await func(*args, **kwargs)
            except Exception as e:
                _record(input_data, start_time, error=e)
                raise
            _record(input_data, start_time)
            return result

        @wraps(func)
        def sync_wrapper(*args, **kwargs):
            start_time = time.time()
            input_data = _capture(kwargs)
            try:
                result = func(*args, **kwargs)
            except Exception as e:
                _record(input_data, start_time, error=e)
                raise
            _record(input_data, start_time)
            return result

        # Pick the wrapper matching the wrapped function's kind.
        return async_wrapper if asyncio.iscoroutinefunction(func) else sync_wrapper

    return decorator
474
+
475
+
476
# ============================================================================
# Part 3: module-level singletons
# ============================================================================

# Default config: prefer Langfuse when the package imported successfully,
# otherwise fall back to local JSONL logging.
tracing_config = TracingConfig(
    enabled=True,
    backend="langfuse" if LANGFUSE_AVAILABLE else "local"
)

# Shared service instance used by the @traced decorator and by other modules.
tracing_service = TracingService(config=tracing_config)
486
+
487
+
488
+ # ============================================================================
489
+ # 第四部分: 集成示例 (如何在agent_service.py中使用)
490
+ # ============================================================================
491
+
492
+ """
493
+ 在你的agent_service.py中添加:
494
+
495
+ 1. 导入追踪服务:
496
+ from app.services.tracing_service import tracing_service
497
+
498
+ 2. 在agent_stream函数开始:
499
+ trace_id = tracing_service.start_trace(
500
+ trace_name="github_agent_analysis",
501
+ session_id=session_id,
502
+ metadata={"repo_url": repo_url, "language": language}
503
+ )
504
+
505
+ 3. 在generate_repo_map函数周围:
506
+ start_time = time.time()
507
+ file_tree_str, mapped_files = await generate_repo_map(repo_url, file_list, limit=limit)
508
+ latency_ms = (time.time() - start_time) * 1000
509
+
510
+ tracing_service.record_span(
511
+ span_name="generate_repo_map",
512
+ operation="repo_mapping",
513
+ input_data={"file_count": len(file_list), "limit": limit},
514
+ output_data={"files_in_map": len(mapped_files)},
515
+ latency_ms=latency_ms
516
+ )
517
+
518
+ 4. 在process_single_file中记录检索:
519
+ tracing_service.record_retrieval_debug(
520
+ query=search_query,
521
+ retrieved_files=valid_files,
522
+ vector_scores=vector_scores,
523
+ bm25_scores=bm25_scores,
524
+ latency_ms=search_latency
525
+ )
526
+
527
+ 5. 工具调用记录:
528
+ start_time = time.time()
529
+ try:
530
+ result = get_file_content(repo_url, file_path)
531
+ tracing_service.record_tool_call(
532
+ tool_name="get_file_content",
533
+ parameters={"file_path": file_path},
534
+ result=result[:100] if result else None,
535
+ latency_ms=(time.time() - start_time) * 1000,
536
+ success=True
537
+ )
538
+ except Exception as e:
539
+ tracing_service.record_tool_call(
540
+ tool_name="get_file_content",
541
+ parameters={"file_path": file_path},
542
+ result=None,
543
+ latency_ms=(time.time() - start_time) * 1000,
544
+ success=False,
545
+ error=str(e)
546
+ )
547
+ """
548
+
549
+ import asyncio
app/services/vector_service.py ADDED
@@ -0,0 +1,676 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ 向量服务层 - Qdrant 版
4
+
5
+ 特性:
6
+ 1. 混合搜索 - Qdrant 向量 + BM25 关键词,RRF 融合
7
+ 2. 异步原生 - 全链路异步
8
+ 3. 会话隔离 - 每个 session 独立集合
9
+ 4. 状态持久化 - 仓库信息、BM25 索引缓存
10
+ """
11
+
12
+ import asyncio
13
+ import json
14
+ import logging
15
+ import os
16
+ import pickle
17
+ import re
18
+ import tempfile
19
+ import time
20
+ from dataclasses import dataclass, field
21
+ from typing import List, Dict, Any, Optional, Set
22
+
23
+ from rank_bm25 import BM25Okapi
24
+
25
+ from app.core.config import settings
26
+ from app.storage.base import Document, SearchResult, CollectionStats
27
+ from app.storage.qdrant_store import QdrantVectorStore, QdrantConfig, get_qdrant_factory
28
+ from app.utils.embedding import get_embedding_service, EmbeddingConfig
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+
33
# ============================================================
# Shared configuration
# ============================================================

from app.core.config import vector_config as config

# Ensure the context directory exists before any store touches it.
os.makedirs(config.context_dir, exist_ok=True)

# === Backward-compatible exports (used by main.py) ===
vector_config = config  # legacy name
CONTEXT_DIR = config.context_dir
QDRANT_DIR = config.data_dir  # Qdrant data directory
46
+
47
+
48
# ============================================================
# Embedding service
# ============================================================

_embedding_service = None

def get_embedding():
    """Return the process-wide embedding service, building it lazily."""
    global _embedding_service
    if _embedding_service is None:
        emb_config = EmbeddingConfig(
            api_base_url=config.embedding_api_url,
            model_name=config.embedding_model,
            batch_size=config.embedding_batch_size,
            max_text_length=config.embedding_max_length,
            max_concurrent_batches=config.embedding_concurrency,
        )
        _embedding_service = get_embedding_service(emb_config)
    return _embedding_service
67
+
68
+
69
+ # ============================================================
70
+ # 向量存储服务
71
+ # ============================================================
72
+
73
class VectorStore:
    """
    Vector storage service.

    Combines Qdrant vector search with in-memory BM25 keyword search,
    with per-session collections and persisted context/cache state.

    Example:
        ```python
        store = VectorStore("session_123")
        await store.initialize()

        # Reset (when analyzing a new repository)
        await store.reset()

        # Add documents
        await store.add_documents(documents, metadatas)

        # Hybrid search
        results = await store.search_hybrid("how does auth work?")

        await store.close()
        ```
    """
96
+
97
    def __init__(self, session_id: str):
        # Sanitized session id; raises ValueError for ids with no valid chars.
        self.session_id = self._sanitize_id(session_id)
        self.collection_name = f"repo_{self.session_id}"

        # Qdrant vector backend (created lazily in initialize()).
        self._qdrant: Optional[QdrantVectorStore] = None

        # In-memory BM25 keyword index and its backing document store.
        self._bm25: Optional[BM25Okapi] = None
        self._doc_store: List[Document] = []
        self._indexed_files: Set[str] = set()

        # Repository context restored from / persisted to the context file.
        self.repo_url: Optional[str] = None
        self.global_context: Dict[str, Any] = {}

        # On-disk state: context JSON and pickled BM25 cache.
        self._context_file = os.path.join(config.context_dir, f"{self.session_id}.json")
        self._cache_file = os.path.join(config.context_dir, f"{self.session_id}_bm25.pkl")

        self._initialized = False
118
+
119
+ @staticmethod
120
+ def _sanitize_id(session_id: str) -> str:
121
+ """清理 session ID"""
122
+ clean = re.sub(r'[^a-zA-Z0-9_-]', '', session_id)
123
+ if not clean:
124
+ raise ValueError("Invalid session_id")
125
+ return clean
126
+
127
    async def initialize(self) -> None:
        """Idempotently set up the Qdrant collection and load cached state."""
        if self._initialized:
            return

        # Create/attach the session's Qdrant collection.
        factory = get_qdrant_factory()
        self._qdrant = factory.create(self.collection_name)
        await self._qdrant.initialize()

        # Restore persisted context and the BM25 cache (or rebuild it).
        await self._load_state()

        self._initialized = True
        logger.debug(f"✅ VectorStore 初始化: {self.session_id}")
142
+
143
+ async def close(self) -> None:
144
+ """关闭连接"""
145
+ if self._qdrant:
146
+ await self._qdrant.close()
147
+ self._qdrant = None
148
+ self._initialized = False
149
+
150
    async def _load_state(self) -> None:
        """Restore persisted state: context JSON, then BM25 cache, else rebuild."""
        # 1. Load the context JSON (repo url + global context).
        if os.path.exists(self._context_file):
            try:
                with open(self._context_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                    self.repo_url = data.get("repo_url")
                    self.global_context = data.get("global_context", {})
            except Exception as e:
                logger.warning(f"加载上下文失败: {e}")

        # 2. Try the pickled BM25 cache; a version mismatch counts as a miss.
        cache_loaded = False
        if os.path.exists(self._cache_file):
            try:
                with open(self._cache_file, 'rb') as f:
                    cache = pickle.load(f)
                    if isinstance(cache, dict) and cache.get("version") == config.cache_version:
                        self._bm25 = cache.get("bm25")
                        self._doc_store = cache.get("doc_store", [])
                        self._indexed_files = cache.get("indexed_files", set())
                        cache_loaded = True
                        logger.debug(f"📦 BM25 缓存命中: {len(self._doc_store)} 文档")
            except Exception as e:
                # Corrupt cache: drop the file so the next run starts clean.
                logger.warning(f"BM25 缓存损坏: {e}")
                os.remove(self._cache_file)

        # 3. Cache miss: rebuild the BM25 index from Qdrant documents.
        if not cache_loaded and self._qdrant:
            await self._rebuild_bm25_index()
181
+
182
+ async def _rebuild_bm25_index(self) -> None:
183
+ """从 Qdrant 重建 BM25 索引"""
184
+ logger.info(f"🔄 重建 BM25 索引: {self.session_id}")
185
+
186
+ documents = await self._qdrant.get_all_documents()
187
+
188
+ if documents:
189
+ self._doc_store = documents
190
+ self._indexed_files = {doc.file_path for doc in documents if doc.file_path}
191
+
192
+ tokenized = [self._tokenize(doc.content) for doc in documents]
193
+ if tokenized:
194
+ self._bm25 = BM25Okapi(tokenized)
195
+
196
+ self._save_bm25_cache()
197
+ logger.info(f"✅ BM25 索引重建完成: {len(documents)} 文档")
198
+
199
+ def _save_bm25_cache(self) -> None:
200
+ """保存 BM25 缓存 (原子写入)"""
201
+ if not self._doc_store:
202
+ return
203
+
204
+ try:
205
+ fd, tmp_path = tempfile.mkstemp(dir=config.context_dir)
206
+ with os.fdopen(fd, 'wb') as f:
207
+ pickle.dump({
208
+ "version": config.cache_version,
209
+ "bm25": self._bm25,
210
+ "doc_store": self._doc_store,
211
+ "indexed_files": self._indexed_files,
212
+ }, f)
213
+
214
+ if os.path.exists(self._cache_file):
215
+ os.remove(self._cache_file)
216
+ os.rename(tmp_path, self._cache_file)
217
+
218
+ except Exception as e:
219
+ logger.error(f"保存 BM25 缓存失败: {e}")
220
+
221
+ def _tokenize(self, text: str) -> List[str]:
222
+ """分词"""
223
+ return [
224
+ t.lower() for t in re.split(config.tokenize_regex, text)
225
+ if t.strip()
226
+ ]
227
+
228
+ async def save_context(self, repo_url: str, context_data: Dict[str, Any]) -> None:
229
+ """保存仓库上下文 (异步,不阻塞事件循环)"""
230
+ self.repo_url = repo_url
231
+ self.global_context = context_data
232
+ await asyncio.to_thread(self._write_context_file, {
233
+ "repo_url": repo_url,
234
+ "global_context": context_data,
235
+ })
236
+
237
+ def _write_context_file(self, updates: Dict[str, Any]) -> None:
238
+ """写入上下文文件 (同步,供线程池调用)"""
239
+ try:
240
+ existing = {}
241
+ if os.path.exists(self._context_file):
242
+ with open(self._context_file, 'r', encoding='utf-8') as f:
243
+ existing = json.load(f)
244
+ existing.update(updates)
245
+ with open(self._context_file, 'w', encoding='utf-8') as f:
246
+ json.dump(existing, f, ensure_ascii=False, indent=2)
247
+ except Exception as e:
248
+ logger.error(f"写入上下文失败: {e}")
249
+
250
+ async def save_report(self, report: str, language: str = "en") -> None:
251
+ """保存技术报告 (异步,不阻塞事件循环)"""
252
+ await asyncio.to_thread(self._write_report, report, language)
253
+
254
+ def _write_report(self, report: str, language: str) -> None:
255
+ """写入报告 (同步,供线程池调用)"""
256
+ try:
257
+ existing = {}
258
+ if os.path.exists(self._context_file):
259
+ with open(self._context_file, 'r', encoding='utf-8') as f:
260
+ existing = json.load(f)
261
+
262
+ if "reports" not in existing:
263
+ existing["reports"] = {}
264
+ existing["reports"][language] = report
265
+ existing["report"] = report
266
+ existing["report_language"] = language
267
+
268
+ with open(self._context_file, 'w', encoding='utf-8') as f:
269
+ json.dump(existing, f, ensure_ascii=False, indent=2)
270
+ logger.info(f"📝 报告已保存: {self.session_id} ({language})")
271
+ except Exception as e:
272
+ logger.error(f"保存报告失败: {e}")
273
+
274
+ def get_report(self, language: str = "en") -> Optional[str]:
275
+ """
276
+ 获取指定语言的报告
277
+
278
+ Args:
279
+ language: 语言代码 ('en', 'zh')
280
+
281
+ Returns:
282
+ 报告内容,不存在返回 None
283
+ """
284
+ context = self.load_context()
285
+ if not context:
286
+ return None
287
+
288
+ # 优先从 reports 字典获取
289
+ reports = context.get("reports", {})
290
+ if language in reports:
291
+ return reports[language]
292
+
293
+ # 兼容旧格式:如果只有 report 字段且语言匹配
294
+ if "report" in context:
295
+ stored_lang = context.get("report_language", "en")
296
+ if stored_lang == language:
297
+ return context["report"]
298
+
299
+ return None
300
+
301
+ def get_available_languages(self) -> List[str]:
302
+ """获取已有报告的语言列表"""
303
+ context = self.load_context()
304
+ if not context:
305
+ return []
306
+
307
+ reports = context.get("reports", {})
308
+ return list(reports.keys())
309
+
310
+ def load_context(self) -> Optional[Dict[str, Any]]:
311
+ """
312
+ 加载仓库上下文
313
+
314
+ Returns:
315
+ 包含 repo_url, global_context, report 等的字典,不存在返回 None
316
+ """
317
+ if not os.path.exists(self._context_file):
318
+ return None
319
+
320
+ try:
321
+ with open(self._context_file, 'r', encoding='utf-8') as f:
322
+ data = json.load(f)
323
+
324
+ # 恢复内存状态
325
+ self.repo_url = data.get("repo_url")
326
+ self.global_context = data.get("global_context", {})
327
+
328
+ return data
329
+ except Exception as e:
330
+ logger.error(f"加载上下文失败: {e}")
331
+ return None
332
+
333
+ def has_index(self) -> bool:
334
+ """检查是否已有索引"""
335
+ context = self.load_context()
336
+ return context is not None and context.get("repo_url") is not None
337
+
338
+ async def reset(self) -> None:
339
+ """重置存储 (分析新仓库时调用)"""
340
+ await self.initialize()
341
+
342
+ # 删除 Qdrant 集合
343
+ if self._qdrant:
344
+ await self._qdrant.delete_collection()
345
+ await self._qdrant.initialize()
346
+
347
+ # 清理本地文件
348
+ for f in [self._context_file, self._cache_file]:
349
+ if os.path.exists(f):
350
+ os.remove(f)
351
+
352
+ # 重置内存状态
353
+ self._bm25 = None
354
+ self._doc_store = []
355
+ self._indexed_files = set()
356
+ self.repo_url = None
357
+ self.global_context = {}
358
+
359
+ logger.info(f"🗑️ 重置存储: {self.session_id}")
360
+
361
+ # 兼容旧接口
362
+ def reset_collection(self) -> None:
363
+ """同步重置 (兼容旧代码)"""
364
+ asyncio.get_event_loop().run_until_complete(self.reset())
365
+
366
+ async def add_documents(
367
+ self,
368
+ documents: List[str],
369
+ metadatas: List[Dict[str, Any]]
370
+ ) -> int:
371
+ """
372
+ 添加文档
373
+
374
+ Args:
375
+ documents: 文档内容列表
376
+ metadatas: 元数据列表
377
+
378
+ Returns:
379
+ 成功添加的数量
380
+ """
381
+ if not documents:
382
+ return 0
383
+
384
+ await self.initialize()
385
+
386
+ # 1. 批量获取 Embedding
387
+ logger.info(f"📊 Embedding: {len(documents)} 个文档")
388
+ embedding_service = get_embedding()
389
+ embeddings = await embedding_service.embed_batch(documents, show_progress=True)
390
+
391
+ # 过滤无效的
392
+ valid_indices = [i for i, emb in enumerate(embeddings) if emb]
393
+ if not valid_indices:
394
+ logger.error("所有 Embedding 都失败了")
395
+ return 0
396
+
397
+ # 2. 构建 Document 对象
398
+ docs = []
399
+ for i in valid_indices:
400
+ doc_id = f"{metadatas[i].get('file', 'unknown')}_{len(self._doc_store) + len(docs)}"
401
+ doc = Document(
402
+ id=doc_id,
403
+ content=documents[i],
404
+ metadata=metadatas[i],
405
+ )
406
+ docs.append(doc)
407
+
408
+ valid_embeddings = [embeddings[i] for i in valid_indices]
409
+
410
+ # 3. 写入 Qdrant
411
+ added = await self._qdrant.add_documents(docs, valid_embeddings)
412
+
413
+ # 4. 更新 BM25 索引 (放入线程池,避免阻塞)
414
+ self._doc_store.extend(docs)
415
+ self._indexed_files.update(doc.file_path for doc in docs)
416
+
417
+ await asyncio.to_thread(self._rebuild_bm25_sync)
418
+
419
+ return added
420
+
421
+ def _rebuild_bm25_sync(self) -> None:
422
+ """重建 BM25 索引 (同步,用于线程池)"""
423
+ tokenized = [self._tokenize(doc.content) for doc in self._doc_store]
424
+ self._bm25 = BM25Okapi(tokenized)
425
+ self._save_bm25_cache()
426
+
427
+ async def embed_text(self, text: str) -> List[float]:
428
+ """获取文本 Embedding"""
429
+ embedding_service = get_embedding()
430
+ return await embedding_service.embed_text(text)
431
+
432
+ async def search_hybrid(
433
+ self,
434
+ query: str,
435
+ top_k: int = None
436
+ ) -> List[Dict[str, Any]]:
437
+ """
438
+ 混合搜索 (向量 + BM25,RRF 融合)
439
+
440
+ Args:
441
+ query: ���询文本
442
+ top_k: 返回数量
443
+
444
+ Returns:
445
+ 搜索结果列表
446
+ """
447
+ await self.initialize()
448
+
449
+ top_k = top_k or config.default_top_k
450
+ candidate_k = top_k * config.search_oversample
451
+
452
+ # 1. 向量搜索
453
+ vector_results: List[SearchResult] = []
454
+ query_embedding = await self.embed_text(query)
455
+
456
+ if query_embedding and self._qdrant:
457
+ vector_results = await self._qdrant.search(
458
+ query_embedding,
459
+ top_k=candidate_k
460
+ )
461
+
462
+ # 2. BM25 搜索
463
+ bm25_results: List[SearchResult] = []
464
+ if self._bm25 and self._doc_store:
465
+ tokens = self._tokenize(query)
466
+ if not tokens:
467
+ tokens = [""]
468
+
469
+ try:
470
+ scores = self._bm25.get_scores(tokens)
471
+ top_indices = sorted(
472
+ range(len(scores)),
473
+ key=lambda i: scores[i],
474
+ reverse=True
475
+ )[:candidate_k]
476
+
477
+ for idx in top_indices:
478
+ if scores[idx] > 0:
479
+ doc = self._doc_store[idx]
480
+ bm25_results.append(SearchResult(
481
+ document=doc,
482
+ score=scores[idx],
483
+ source="bm25",
484
+ ))
485
+ except Exception as e:
486
+ logger.error(f"BM25 搜索失败: {e}")
487
+
488
+ # 3. RRF 融合
489
+ fused = self._rrf_fusion(vector_results, bm25_results)
490
+
491
+ # 4. 格式化输出 (兼容旧接口)
492
+ results = []
493
+ for item in fused[:top_k]:
494
+ doc = item.document
495
+ results.append({
496
+ "id": doc.id,
497
+ "content": doc.content,
498
+ "file": doc.file_path,
499
+ "metadata": doc.metadata,
500
+ "score": item.score,
501
+ })
502
+
503
+ return results
504
+
505
+ def _rrf_fusion(
506
+ self,
507
+ vector_results: List[SearchResult],
508
+ bm25_results: List[SearchResult]
509
+ ) -> List[SearchResult]:
510
+ """RRF (Reciprocal Rank Fusion) 融合"""
511
+ k = config.rrf_k
512
+ fused: Dict[str, Dict] = {}
513
+
514
+ # 向量结果
515
+ for rank, result in enumerate(vector_results):
516
+ doc_id = result.document.id
517
+ if doc_id not in fused:
518
+ fused[doc_id] = {"result": result, "score": 0}
519
+ fused[doc_id]["score"] += config.rrf_weight_vector / (k + rank + 1)
520
+
521
+ # BM25 结果
522
+ for rank, result in enumerate(bm25_results):
523
+ doc_id = result.document.id
524
+ if doc_id not in fused:
525
+ fused[doc_id] = {"result": result, "score": 0}
526
+ fused[doc_id]["score"] += config.rrf_weight_bm25 / (k + rank + 1)
527
+
528
+ # 排序
529
+ sorted_items = sorted(
530
+ fused.values(),
531
+ key=lambda x: x["score"],
532
+ reverse=True
533
+ )
534
+
535
+ return [
536
+ SearchResult(
537
+ document=item["result"].document,
538
+ score=item["score"],
539
+ source="hybrid",
540
+ )
541
+ for item in sorted_items
542
+ ]
543
+
544
+ def get_documents_by_file(self, file_path: str) -> List[Dict[str, Any]]:
545
+ """根据文件路径获取文档 (兼容旧接口)"""
546
+ docs = [
547
+ doc for doc in self._doc_store
548
+ if doc.file_path == file_path
549
+ ]
550
+
551
+ result = []
552
+ for doc in sorted(docs, key=lambda d: d.metadata.get("start_line", 0)):
553
+ result.append({
554
+ "id": doc.id,
555
+ "content": doc.content,
556
+ "file": doc.file_path,
557
+ "metadata": doc.metadata,
558
+ "score": 1.0,
559
+ })
560
+
561
+ return result
562
+
563
+ @property
564
+ def indexed_files(self) -> Set[str]:
565
+ """已索引的文件"""
566
+ return self._indexed_files
567
+
568
+
569
+ # ============================================================
570
+ # 管理器 - LRU Cache + 过期清理
571
+ # ============================================================
572
+
573
+ class SessionEntry:
574
+ """Session 条目 - 包含存储实例和访问时间"""
575
+ __slots__ = ('store', 'last_access', 'created_at')
576
+
577
+ def __init__(self, store: VectorStore):
578
+ self.store = store
579
+ self.last_access = time.time()
580
+ self.created_at = time.time()
581
+
582
+ def touch(self) -> None:
583
+ """更新访问时间"""
584
+ self.last_access = time.time()
585
+
586
+
587
+ class VectorStoreManager:
588
+ """
589
+ 向量存储管理器 - LRU Cache 实现
590
+
591
+ 特性:
592
+ 1. LRU 淘汰 - 超过 max_count 时淘汰最久未访问的内存中的 session
593
+ 2. 仓库数据永久存储 - 不清理仓库索引和报告
594
+ 3. 线程安全 - 使用 asyncio.Lock
595
+ """
596
+
597
+ def __init__(self, max_count: int = None):
598
+ self._max_count = max_count or config.session_max_count
599
+ self._sessions: Dict[str, SessionEntry] = {}
600
+ self._lock = asyncio.Lock()
601
+
602
+ def get_store(self, session_id: str) -> VectorStore:
603
+ """
604
+ 获取或创建存储实例 (同步接口,兼容现有代码)
605
+
606
+ 会触发 LRU 淘汰检查
607
+ """
608
+ if session_id in self._sessions:
609
+ entry = self._sessions[session_id]
610
+ entry.touch()
611
+ # 移动到最后(模拟 LRU)
612
+ self._sessions.pop(session_id)
613
+ self._sessions[session_id] = entry
614
+ return entry.store
615
+
616
+ # 创建新 session
617
+ store = VectorStore(session_id)
618
+ entry = SessionEntry(store)
619
+ self._sessions[session_id] = entry
620
+
621
+ # 检查是否需要 LRU 淘汰(异步执行)
622
+ if len(self._sessions) > self._max_count:
623
+ asyncio.create_task(self._evict_lru())
624
+
625
+ logger.info(f"📦 Session 创建: {session_id} (总数: {len(self._sessions)})")
626
+ return store
627
+
628
+ async def _evict_lru(self) -> None:
629
+ """淘汰最久未访问的 session"""
630
+ async with self._lock:
631
+ while len(self._sessions) > self._max_count:
632
+ # 找到最久未访问的
633
+ oldest_id = min(
634
+ self._sessions.keys(),
635
+ key=lambda k: self._sessions[k].last_access
636
+ )
637
+ entry = self._sessions.pop(oldest_id)
638
+ await entry.store.close()
639
+ logger.info(f"🗑️ LRU 淘汰: {oldest_id}")
640
+
641
+ async def close_session(self, session_id: str) -> None:
642
+ """关闭指定 session"""
643
+ async with self._lock:
644
+ if session_id in self._sessions:
645
+ entry = self._sessions.pop(session_id)
646
+ await entry.store.close()
647
+ logger.info(f"🔒 Session 关闭: {session_id}")
648
+
649
+ async def close_all(self) -> None:
650
+ """关闭所有连接"""
651
+ async with self._lock:
652
+ for session_id, entry in list(self._sessions.items()):
653
+ await entry.store.close()
654
+ self._sessions.clear()
655
+ logger.info("🔒 所有 Session 已关闭")
656
+
657
+ def get_stats(self) -> Dict[str, Any]:
658
+ """获取管理器统计信息"""
659
+ now = time.time()
660
+ sessions_info = []
661
+ for sid, entry in self._sessions.items():
662
+ sessions_info.append({
663
+ "session_id": sid,
664
+ "age_hours": round((now - entry.created_at) / 3600, 2),
665
+ "idle_minutes": round((now - entry.last_access) / 60, 2),
666
+ })
667
+
668
+ return {
669
+ "total_sessions": len(self._sessions),
670
+ "max_sessions": self._max_count,
671
+ "sessions": sorted(sessions_info, key=lambda x: x["idle_minutes"], reverse=True)
672
+ }
673
+
674
+
675
+ # 全局管理器
676
+ store_manager = VectorStoreManager()
app/storage/__init__.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ 存储层模块
4
+
5
+ 提供向量存储的抽象和实现
6
+ """
7
+
8
+ from app.storage.base import (
9
+ Document,
10
+ SearchResult,
11
+ CollectionStats,
12
+ StorageBackend,
13
+ BaseVectorStore,
14
+ )
15
+ from app.storage.qdrant_store import (
16
+ QdrantConfig,
17
+ QdrantVectorStore,
18
+ QdrantStoreFactory,
19
+ get_qdrant_factory,
20
+ )
21
+
22
+ __all__ = [
23
+ # 基础类型
24
+ "Document",
25
+ "SearchResult",
26
+ "CollectionStats",
27
+ "StorageBackend",
28
+ "BaseVectorStore",
29
+ # Qdrant
30
+ "QdrantConfig",
31
+ "QdrantVectorStore",
32
+ "QdrantStoreFactory",
33
+ "get_qdrant_factory",
34
+ ]
app/storage/base.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ 向量存储抽象层
4
+
5
+ 设计原则:
6
+ 1. 接口与实现分离 - 易于切换存储后端
7
+ 2. 异步优先 - 所有 I/O 操作都是异步的
8
+ 3. 类型安全 - 完整的类型注解
9
+ 4. 可观测 - 内置指标收集
10
+ """
11
+
12
+ from abc import ABC, abstractmethod
13
+ from dataclasses import dataclass, field
14
+ from typing import List, Dict, Any, Optional, Set
15
+ from enum import Enum
16
+ import logging
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ # ============================================================
22
+ # 数据模型
23
+ # ============================================================
24
+
25
+ @dataclass
26
+ class Document:
27
+ """文档数据模型"""
28
+ id: str
29
+ content: str
30
+ metadata: Dict[str, Any] = field(default_factory=dict)
31
+ embedding: Optional[List[float]] = None
32
+
33
+ @property
34
+ def file_path(self) -> str:
35
+ return self.metadata.get("file", "")
36
+
37
+ def to_dict(self) -> Dict[str, Any]:
38
+ return {
39
+ "id": self.id,
40
+ "content": self.content,
41
+ "metadata": self.metadata,
42
+ }
43
+
44
+
45
+ @dataclass
46
+ class SearchResult:
47
+ """搜索结果"""
48
+ document: Document
49
+ score: float
50
+ source: str = "vector" # "vector" | "bm25" | "hybrid"
51
+
52
+ def to_dict(self) -> Dict[str, Any]:
53
+ return {
54
+ "id": self.document.id,
55
+ "content": self.document.content,
56
+ "file": self.document.file_path,
57
+ "metadata": self.document.metadata,
58
+ "score": self.score,
59
+ "source": self.source,
60
+ }
61
+
62
+
63
+ @dataclass
64
+ class CollectionStats:
65
+ """集合统计信息"""
66
+ name: str
67
+ document_count: int
68
+ indexed_files: Set[str] = field(default_factory=set)
69
+ vector_dimension: int = 0
70
+
71
+
72
+ class StorageBackend(Enum):
73
+ """存储后端类型"""
74
+ QDRANT = "qdrant"
75
+ CHROMA = "chroma" # 保留兼容性
76
+
77
+
78
+ # ============================================================
79
+ # 抽象基类
80
+ # ============================================================
81
+
82
+ class BaseVectorStore(ABC):
83
+ """
84
+ 向量存储抽象基类
85
+
86
+ 所有存储后端必须实现这些方法
87
+ """
88
+
89
+ @abstractmethod
90
+ async def initialize(self) -> None:
91
+ """初始化存储连接"""
92
+ pass
93
+
94
+ @abstractmethod
95
+ async def close(self) -> None:
96
+ """关闭连接"""
97
+ pass
98
+
99
+ @abstractmethod
100
+ async def add_documents(
101
+ self,
102
+ documents: List[Document],
103
+ embeddings: List[List[float]]
104
+ ) -> int:
105
+ """
106
+ 添加文档
107
+
108
+ Args:
109
+ documents: 文档列表
110
+ embeddings: 对应的嵌入向量
111
+
112
+ Returns:
113
+ 成功添加的文档数量
114
+ """
115
+ pass
116
+
117
+ @abstractmethod
118
+ async def search(
119
+ self,
120
+ query_embedding: List[float],
121
+ top_k: int = 10,
122
+ filter_conditions: Optional[Dict[str, Any]] = None
123
+ ) -> List[SearchResult]:
124
+ """
125
+ 向量相似度搜索
126
+
127
+ Args:
128
+ query_embedding: 查询向量
129
+ top_k: 返回数量
130
+ filter_conditions: 过滤条件
131
+
132
+ Returns:
133
+ 搜索结果列表
134
+ """
135
+ pass
136
+
137
+ @abstractmethod
138
+ async def delete_collection(self) -> bool:
139
+ """删除当前集合"""
140
+ pass
141
+
142
+ @abstractmethod
143
+ async def get_stats(self) -> CollectionStats:
144
+ """获取集合统计信息"""
145
+ pass
146
+
147
+ @abstractmethod
148
+ async def get_documents_by_file(self, file_path: str) -> List[Document]:
149
+ """根据文件路径获取文档"""
150
+ pass
151
+
152
+
153
+ class BaseVectorStoreFactory(ABC):
154
+ """向量存储工厂基类"""
155
+
156
+ @abstractmethod
157
+ def create(self, collection_name: str) -> BaseVectorStore:
158
+ """创建存储实例"""
159
+ pass
app/storage/qdrant_store.py ADDED
@@ -0,0 +1,578 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Qdrant 向量存储实现
4
+
5
+ 特性:
6
+ 1. 异步原生 - 使用 qdrant-client AsyncQdrantClient
7
+ 2. 高性能 - 批量 upsert、HNSW 索引、payload 索引
8
+ 3. 混合搜索 - 向量 + 稀疏向量 (FastEmbed)
9
+ 4. 连接池 - gRPC 长连接复用
10
+ 5. 可观测 - 完整的日志和指标
11
+ """
12
+
13
+ import asyncio
14
+ import logging
15
+ import os
16
+ from dataclasses import dataclass
17
+ from typing import List, Dict, Any, Optional, Set
18
+ from contextlib import asynccontextmanager
19
+
20
+ from qdrant_client import AsyncQdrantClient, models
21
+ from qdrant_client.models import (
22
+ Distance,
23
+ VectorParams,
24
+ PointStruct,
25
+ Filter,
26
+ FieldCondition,
27
+ MatchValue,
28
+ PayloadSchemaType,
29
+ )
30
+
31
+ from app.storage.base import (
32
+ BaseVectorStore,
33
+ Document,
34
+ SearchResult,
35
+ CollectionStats,
36
+ )
37
+
38
+ logger = logging.getLogger(__name__)
39
+
40
+
41
+ # ============================================================
42
+ # 配置
43
+ # ============================================================
44
+
45
+ @dataclass
46
+ class QdrantConfig:
47
+ """
48
+ Qdrant 配置
49
+
50
+ 支持三种模式:
51
+ - local: 本地嵌入式 (开发/单进程)
52
+ - server: Qdrant Server (多 Worker 生产环境)
53
+ - cloud: Qdrant Cloud (托管服务)
54
+
55
+ 环境变量:
56
+ - QDRANT_MODE: "local" | "server" | "cloud"
57
+ - QDRANT_URL: 服务器地址 (server/cloud 模式)
58
+ - QDRANT_API_KEY: API 密钥 (cloud 模式必需)
59
+ - QDRANT_LOCAL_PATH: 本地存储路径 (local 模式)
60
+ """
61
+ # 模式: "local" | "server" | "cloud"
62
+ mode: str = "local"
63
+
64
+ # Server/Cloud 模式配置
65
+ url: Optional[str] = None
66
+ host: str = "localhost"
67
+ port: int = 6333
68
+ grpc_port: int = 6334
69
+ prefer_grpc: bool = True
70
+ api_key: Optional[str] = None
71
+
72
+ # Local 模式配置
73
+ local_path: str = "data/qdrant_db"
74
+
75
+ # 向量配置
76
+ vector_size: int = 1024 # BGE-M3 维度
77
+ distance: Distance = Distance.COSINE
78
+
79
+ # 索引配置
80
+ hnsw_m: int = 16 # HNSW 图的边数
81
+ hnsw_ef_construct: int = 100 # 构建时的搜索深度
82
+
83
+ # 批量操作
84
+ batch_size: int = 100
85
+
86
+ # 超时
87
+ timeout: float = 30.0
88
+
89
+ @classmethod
90
+ def from_env(cls) -> "QdrantConfig":
91
+ """从环境变量加载配置"""
92
+ mode = os.getenv("QDRANT_MODE", "local").lower()
93
+
94
+ return cls(
95
+ mode=mode,
96
+ url=os.getenv("QDRANT_URL"),
97
+ host=os.getenv("QDRANT_HOST", "localhost"),
98
+ port=int(os.getenv("QDRANT_PORT", "6333")),
99
+ grpc_port=int(os.getenv("QDRANT_GRPC_PORT", "6334")),
100
+ api_key=os.getenv("QDRANT_API_KEY"),
101
+ local_path=os.getenv("QDRANT_LOCAL_PATH", "data/qdrant_db"),
102
+ vector_size=int(os.getenv("QDRANT_VECTOR_SIZE", "1024")),
103
+ prefer_grpc=os.getenv("QDRANT_PREFER_GRPC", "true").lower() == "true",
104
+ )
105
+
106
+ @property
107
+ def is_local(self) -> bool:
108
+ return self.mode == "local"
109
+
110
+ @property
111
+ def is_server(self) -> bool:
112
+ return self.mode == "server"
113
+
114
+ @property
115
+ def is_cloud(self) -> bool:
116
+ return self.mode == "cloud"
117
+
118
+ def validate(self) -> None:
119
+ """验证配置"""
120
+ if self.is_cloud and not self.api_key:
121
+ raise ValueError("QDRANT_API_KEY is required for cloud mode")
122
+ if (self.is_server or self.is_cloud) and not (self.url or self.host):
123
+ raise ValueError("QDRANT_URL or QDRANT_HOST is required for server/cloud mode")
124
+
125
+
126
+ # ============================================================
127
+ # 全局共享客户端单例
128
+ # ============================================================
129
+
130
+ _shared_client: Optional[AsyncQdrantClient] = None
131
+ _shared_config: Optional[QdrantConfig] = None
132
+ _client_lock = asyncio.Lock()
133
+
134
+
135
+ async def get_shared_client(config: Optional[QdrantConfig] = None) -> AsyncQdrantClient:
136
+ """
137
+ 获取共享的 Qdrant 客户端单例
138
+
139
+ 支持三种模式:
140
+ - local: 本地嵌入式存储 (单进程,开发环境)
141
+ - server: Qdrant Server (多 Worker,Docker 部署)
142
+ - cloud: Qdrant Cloud (托管服务)
143
+ """
144
+ global _shared_client, _shared_config
145
+
146
+ async with _client_lock:
147
+ if _shared_client is None:
148
+ _shared_config = config or QdrantConfig.from_env()
149
+ _shared_config.validate()
150
+
151
+ if _shared_config.is_local:
152
+ # Local 模式: 嵌入式存储
153
+ os.makedirs(_shared_config.local_path, exist_ok=True)
154
+ _shared_client = AsyncQdrantClient(
155
+ path=_shared_config.local_path,
156
+ timeout=_shared_config.timeout,
157
+ )
158
+ logger.info(f"📦 Qdrant 本地模式: {_shared_config.local_path}")
159
+
160
+ elif _shared_config.is_server:
161
+ # Server 模式: 连接 Qdrant Server
162
+ if _shared_config.url:
163
+ _shared_client = AsyncQdrantClient(
164
+ url=_shared_config.url,
165
+ prefer_grpc=_shared_config.prefer_grpc,
166
+ timeout=_shared_config.timeout,
167
+ )
168
+ logger.info(f"🌐 Qdrant Server 模式: {_shared_config.url}")
169
+ else:
170
+ _shared_client = AsyncQdrantClient(
171
+ host=_shared_config.host,
172
+ port=_shared_config.port,
173
+ grpc_port=_shared_config.grpc_port,
174
+ prefer_grpc=_shared_config.prefer_grpc,
175
+ timeout=_shared_config.timeout,
176
+ )
177
+ logger.info(f"🌐 Qdrant Server 模式: {_shared_config.host}:{_shared_config.port}")
178
+
179
+ else:
180
+ # Cloud 模式: 连接 Qdrant Cloud
181
+ _shared_client = AsyncQdrantClient(
182
+ url=_shared_config.url,
183
+ api_key=_shared_config.api_key,
184
+ timeout=_shared_config.timeout,
185
+ )
186
+ logger.info(f"☁️ Qdrant Cloud 模式: {_shared_config.url}")
187
+
188
+ return _shared_client
189
+
190
+ return _shared_client
191
+
192
+
193
+ async def close_shared_client() -> None:
194
+ """关闭共享客户端"""
195
+ global _shared_client
196
+ if _shared_client is not None:
197
+ await _shared_client.close()
198
+ _shared_client = None
199
+ logger.info("🔒 Qdrant 共享客户端已关闭")
200
+
201
+
202
+ # ============================================================
203
+ # Qdrant 存储实现
204
+ # ============================================================
205
+
206
+ class QdrantVectorStore(BaseVectorStore):
207
+ """
208
+ Qdrant 向量存储
209
+
210
+ 使用示例:
211
+ ```python
212
+ config = QdrantConfig.from_env()
213
+ store = QdrantVectorStore("my_collection", config)
214
+
215
+ await store.initialize()
216
+
217
+ # 添加文档
218
+ docs = [Document(id="1", content="hello", metadata={"file": "a.py"})]
219
+ embeddings = [[0.1, 0.2, ...]]
220
+ await store.add_documents(docs, embeddings)
221
+
222
+ # 搜索
223
+ results = await store.search(query_embedding, top_k=5)
224
+
225
+ await store.close()
226
+ ```
227
+ """
228
+
229
+ # Payload 字段名常量
230
+ FIELD_CONTENT = "content"
231
+ FIELD_FILE = "file"
232
+ FIELD_METADATA = "metadata"
233
+
234
+ def __init__(
235
+ self,
236
+ collection_name: str,
237
+ config: Optional[QdrantConfig] = None
238
+ ):
239
+ self.collection_name = self._sanitize_name(collection_name)
240
+ self.config = config or QdrantConfig.from_env()
241
+ self._initialized = False
242
+
243
+ @staticmethod
244
+ def _sanitize_name(name: str) -> str:
245
+ """清理集合名称"""
246
+ import re
247
+ clean = re.sub(r'[^a-zA-Z0-9_-]', '_', name)
248
+ return clean[:63] if clean else "default"
249
+
250
+ async def _get_client(self) -> AsyncQdrantClient:
251
+ """获取共享客户端 (解决 Qdrant Local 并发访问问题)"""
252
+ return await get_shared_client(self.config)
253
+
254
+ async def initialize(self) -> None:
255
+ """初始化集合"""
256
+ if self._initialized:
257
+ return
258
+
259
+ client = await self._get_client()
260
+
261
+ # 检查集合是否存在
262
+ collections = await client.get_collections()
263
+ exists = any(c.name == self.collection_name for c in collections.collections)
264
+
265
+ if not exists:
266
+ # 创建集合
267
+ await client.create_collection(
268
+ collection_name=self.collection_name,
269
+ vectors_config=VectorParams(
270
+ size=self.config.vector_size,
271
+ distance=self.config.distance,
272
+ hnsw_config=models.HnswConfigDiff(
273
+ m=self.config.hnsw_m,
274
+ ef_construct=self.config.hnsw_ef_construct,
275
+ ),
276
+ ),
277
+ # 启用 payload 索引以加速过滤
278
+ optimizers_config=models.OptimizersConfigDiff(
279
+ indexing_threshold=0, # 立即索引
280
+ ),
281
+ )
282
+
283
+ # 创建 payload 索引
284
+ await client.create_payload_index(
285
+ collection_name=self.collection_name,
286
+ field_name=self.FIELD_FILE,
287
+ field_schema=PayloadSchemaType.KEYWORD,
288
+ )
289
+
290
+ logger.info(f"✅ 创建集合: {self.collection_name}")
291
+ else:
292
+ logger.debug(f"📂 集合已存在: {self.collection_name}")
293
+
294
+ self._initialized = True
295
+
296
+ async def close(self) -> None:
297
+ """
298
+ 关闭连接 (使用共享客户端时不实际关闭)
299
+
300
+ 注意: 由于使用共享客户端,单个 Store 的 close() 不会关闭客户端。
301
+ 全局关闭请使用 close_shared_client()
302
+ """
303
+ self._initialized = False
304
+ logger.debug(f"🔌 Store 已关闭: {self.collection_name}")
305
+
306
+ async def add_documents(
307
+ self,
308
+ documents: List[Document],
309
+ embeddings: List[List[float]]
310
+ ) -> int:
311
+ """批量添加文档"""
312
+ if not documents or not embeddings:
313
+ return 0
314
+
315
+ if len(documents) != len(embeddings):
316
+ raise ValueError(f"文档数量 ({len(documents)}) 与向量数量 ({len(embeddings)}) 不匹配")
317
+
318
+ await self.initialize()
319
+ client = await self._get_client()
320
+
321
+ # 过滤空向量
322
+ valid_pairs = [
323
+ (doc, emb) for doc, emb in zip(documents, embeddings)
324
+ if emb and len(emb) == self.config.vector_size
325
+ ]
326
+
327
+ if not valid_pairs:
328
+ logger.warning("没有有效的文档向量对")
329
+ return 0
330
+
331
+ # 构建 Points
332
+ points = []
333
+ for doc, embedding in valid_pairs:
334
+ point = PointStruct(
335
+ id=self._generate_point_id(doc.id),
336
+ vector=embedding,
337
+ payload={
338
+ self.FIELD_CONTENT: doc.content,
339
+ self.FIELD_FILE: doc.file_path,
340
+ self.FIELD_METADATA: doc.metadata,
341
+ "doc_id": doc.id,
342
+ },
343
+ )
344
+ points.append(point)
345
+
346
+ # 批量 upsert
347
+ total_added = 0
348
+ batch_size = self.config.batch_size
349
+
350
+ for i in range(0, len(points), batch_size):
351
+ batch = points[i:i + batch_size]
352
+ try:
353
+ await client.upsert(
354
+ collection_name=self.collection_name,
355
+ points=batch,
356
+ wait=True,
357
+ )
358
+ total_added += len(batch)
359
+ except Exception as e:
360
+ logger.error(f"批次 {i // batch_size + 1} 写入失败: {e}")
361
+
362
+ logger.info(f"✅ 写入 {total_added}/{len(points)} 个文档到 {self.collection_name}")
363
+ return total_added
364
+
365
+ def _generate_point_id(self, doc_id: str) -> int:
366
+ """生成数值型 Point ID (Qdrant 要求)"""
367
+ import hashlib
368
+ hash_bytes = hashlib.sha256(doc_id.encode()).digest()
369
+ # 取前 8 字节转为正整数
370
+ return int.from_bytes(hash_bytes[:8], byteorder='big') & 0x7FFFFFFFFFFFFFFF
371
+
372
+ async def search(
373
+ self,
374
+ query_embedding: List[float],
375
+ top_k: int = 10,
376
+ filter_conditions: Optional[Dict[str, Any]] = None
377
+ ) -> List[SearchResult]:
378
+ """向量相似度搜索"""
379
+ if not query_embedding:
380
+ return []
381
+
382
+ await self.initialize()
383
+ client = await self._get_client()
384
+
385
+ # 构建过滤器
386
+ query_filter = None
387
+ if filter_conditions:
388
+ must_conditions = []
389
+ for field, value in filter_conditions.items():
390
+ must_conditions.append(
391
+ FieldCondition(
392
+ key=field,
393
+ match=MatchValue(value=value),
394
+ )
395
+ )
396
+ query_filter = Filter(must=must_conditions)
397
+
398
+ try:
399
+ # 使用 query_points (qdrant-client >= 1.7.0)
400
+ results = await client.query_points(
401
+ collection_name=self.collection_name,
402
+ query=query_embedding,
403
+ limit=top_k,
404
+ query_filter=query_filter,
405
+ with_payload=True,
406
+ score_threshold=0.0,
407
+ )
408
+
409
+ search_results = []
410
+ for hit in results.points:
411
+ payload = hit.payload or {}
412
+ doc = Document(
413
+ id=payload.get("doc_id", str(hit.id)),
414
+ content=payload.get(self.FIELD_CONTENT, ""),
415
+ metadata=payload.get(self.FIELD_METADATA, {}),
416
+ )
417
+ search_results.append(SearchResult(
418
+ document=doc,
419
+ score=hit.score,
420
+ source="vector",
421
+ ))
422
+
423
+ return search_results
424
+
425
+ except Exception as e:
426
+ logger.error(f"搜索失败: {e}")
427
+ return []
428
+
429
+ async def delete_collection(self) -> bool:
430
+ """删除集合"""
431
+ try:
432
+ client = await self._get_client()
433
+ await client.delete_collection(self.collection_name)
434
+ self._initialized = False
435
+ logger.info(f"🗑️ 删除集合: {self.collection_name}")
436
+ return True
437
+ except Exception as e:
438
+ logger.error(f"删除集合失败: {e}")
439
+ return False
440
+
441
+ async def get_stats(self) -> CollectionStats:
442
+ """获取集合统计"""
443
+ await self.initialize()
444
+ client = await self._get_client()
445
+
446
+ try:
447
+ info = await client.get_collection(self.collection_name)
448
+
449
+ # 获取所有唯一文件
450
+ indexed_files: Set[str] = set()
451
+ scroll_result = await client.scroll(
452
+ collection_name=self.collection_name,
453
+ limit=10000,
454
+ with_payload=[self.FIELD_FILE],
455
+ )
456
+
457
+ for point in scroll_result[0]:
458
+ if point.payload:
459
+ file_path = point.payload.get(self.FIELD_FILE)
460
+ if file_path:
461
+ indexed_files.add(file_path)
462
+
463
+ return CollectionStats(
464
+ name=self.collection_name,
465
+ document_count=info.points_count or 0,
466
+ indexed_files=indexed_files,
467
+ vector_dimension=self.config.vector_size,
468
+ )
469
+ except Exception as e:
470
+ logger.error(f"获取统计失败: {e}")
471
+ return CollectionStats(name=self.collection_name, document_count=0)
472
+
473
+ async def get_documents_by_file(self, file_path: str) -> List[Document]:
474
+ """根据文件路径获取文档"""
475
+ await self.initialize()
476
+ client = await self._get_client()
477
+
478
+ try:
479
+ scroll_result = await client.scroll(
480
+ collection_name=self.collection_name,
481
+ scroll_filter=Filter(
482
+ must=[
483
+ FieldCondition(
484
+ key=self.FIELD_FILE,
485
+ match=MatchValue(value=file_path),
486
+ )
487
+ ]
488
+ ),
489
+ limit=1000,
490
+ with_payload=True,
491
+ )
492
+
493
+ documents = []
494
+ for point in scroll_result[0]:
495
+ payload = point.payload or {}
496
+ doc = Document(
497
+ id=payload.get("doc_id", str(point.id)),
498
+ content=payload.get(self.FIELD_CONTENT, ""),
499
+ metadata=payload.get(self.FIELD_METADATA, {}),
500
+ )
501
+ documents.append(doc)
502
+
503
+ # 按行号排序
504
+ documents.sort(key=lambda d: d.metadata.get("start_line", 0))
505
+ return documents
506
+
507
+ except Exception as e:
508
+ logger.error(f"获取文件文档失败: {e}")
509
+ return []
510
+
511
+ async def get_all_documents(self) -> List[Document]:
512
+ """获取所有文档 (用于 BM25 索引构建)"""
513
+ await self.initialize()
514
+ client = await self._get_client()
515
+
516
+ documents = []
517
+ offset = None
518
+
519
+ try:
520
+ while True:
521
+ scroll_result = await client.scroll(
522
+ collection_name=self.collection_name,
523
+ limit=1000,
524
+ offset=offset,
525
+ with_payload=True,
526
+ )
527
+
528
+ points, next_offset = scroll_result
529
+
530
+ for point in points:
531
+ payload = point.payload or {}
532
+ doc = Document(
533
+ id=payload.get("doc_id", str(point.id)),
534
+ content=payload.get(self.FIELD_CONTENT, ""),
535
+ metadata=payload.get(self.FIELD_METADATA, {}),
536
+ )
537
+ documents.append(doc)
538
+
539
+ if next_offset is None:
540
+ break
541
+ offset = next_offset
542
+
543
+ return documents
544
+
545
+ except Exception as e:
546
+ logger.error(f"获取所有文档失败: {e}")
547
+ return []
548
+
549
+
550
+ # ============================================================
551
+ # 工厂
552
+ # ============================================================
553
+
554
+ class QdrantStoreFactory:
555
+ """Qdrant 存储工厂"""
556
+
557
+ def __init__(self, config: Optional[QdrantConfig] = None):
558
+ self.config = config or QdrantConfig.from_env()
559
+
560
+ def create(self, collection_name: str) -> QdrantVectorStore:
561
+ """创建存储实例"""
562
+ return QdrantVectorStore(collection_name, self.config)
563
+
564
+ async def get_client(self) -> AsyncQdrantClient:
565
+ """获取共享的 Qdrant 客户端"""
566
+ return await get_shared_client(self.config)
567
+
568
+
569
+ # 全局工厂实例
570
+ _qdrant_factory: Optional[QdrantStoreFactory] = None
571
+
572
+
573
+ def get_qdrant_factory(config: Optional[QdrantConfig] = None) -> QdrantStoreFactory:
574
+ """获取工厂单例"""
575
+ global _qdrant_factory
576
+ if _qdrant_factory is None:
577
+ _qdrant_factory = QdrantStoreFactory(config)
578
+ return _qdrant_factory
app/utils/embedding.py ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Embedding 服务 - 并发优化版
4
+
5
+ 特性:
6
+ 1. 并发批量请求 - 使用 asyncio.gather 并行处理多个批次
7
+ 2. 信号量控制 - 限制最大并发数,避免 API 限流
8
+ 3. 重试机制 - 使用 tenacity 处理临时性错误
9
+ 4. 智能分批 - 根据 token 数量动态调整批次大小
10
+ """
11
+
12
+ import asyncio
13
+ import logging
14
+ from typing import List, Optional
15
+ from dataclasses import dataclass
16
+
17
+ from openai import AsyncOpenAI
18
+
19
+ from app.core.config import settings
20
+ from app.utils.retry import llm_retry, is_retryable_error
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ @dataclass
26
+ class EmbeddingConfig:
27
+ """Embedding 服务配置"""
28
+ # API 配置
29
+ api_base_url: str = "https://api.siliconflow.cn/v1"
30
+ model_name: str = "BAAI/bge-m3"
31
+
32
+ # 批处理配置
33
+ batch_size: int = 50 # 每批文本数量
34
+ max_text_length: int = 8000 # 单个文本最大字符数
35
+
36
+ # 并发控制
37
+ max_concurrent_batches: int = 5 # 最大并发批次数
38
+
39
+ # 超时配置
40
+ timeout: int = 60 # 单次请求超时 (秒)
41
+
42
+
43
+ class EmbeddingService:
44
+ """
45
+ 高性能 Embedding 服务
46
+
47
+ 使用示例:
48
+ ```python
49
+ service = EmbeddingService()
50
+
51
+ # 单文本
52
+ embedding = await service.embed_text("Hello world")
53
+
54
+ # 批量文本 (自动并发优化)
55
+ texts = ["text1", "text2", ..., "text100"]
56
+ embeddings = await service.embed_batch(texts)
57
+ ```
58
+ """
59
+
60
+ def __init__(self, config: Optional[EmbeddingConfig] = None):
61
+ self.config = config or EmbeddingConfig()
62
+
63
+ # 初始化 OpenAI 客户端 (SiliconFlow 兼容 OpenAI 协议)
64
+ self._client = AsyncOpenAI(
65
+ api_key=settings.SILICON_API_KEY,
66
+ base_url=self.config.api_base_url,
67
+ timeout=self.config.timeout
68
+ )
69
+
70
+ # 并发信号量
71
+ self._semaphore = asyncio.Semaphore(self.config.max_concurrent_batches)
72
+
73
+ # 统计信息
74
+ self._stats = {
75
+ "total_requests": 0,
76
+ "successful_requests": 0,
77
+ "failed_requests": 0,
78
+ "total_texts": 0,
79
+ "retried_requests": 0
80
+ }
81
+
82
+ def _preprocess_text(self, text: str) -> str:
83
+ """预处理文本: 移除换行、截断长度"""
84
+ text = text.replace("\n", " ").strip()
85
+ if len(text) > self.config.max_text_length:
86
+ text = text[:self.config.max_text_length]
87
+ return text
88
+
89
+ @llm_retry
90
+ async def _embed_single_batch(self, texts: List[str]) -> List[List[float]]:
91
+ """
92
+ 处理单个批次的 Embedding 请求 (带重试)
93
+
94
+ Args:
95
+ texts: 预处理后的文本列表
96
+
97
+ Returns:
98
+ embedding 向量列表
99
+ """
100
+ self._stats["total_requests"] += 1
101
+
102
+ response = await self._client.embeddings.create(
103
+ input=texts,
104
+ model=self.config.model_name
105
+ )
106
+
107
+ self._stats["successful_requests"] += 1
108
+ return [item.embedding for item in response.data]
109
+
110
+ async def _embed_batch_with_semaphore(
111
+ self,
112
+ batch_texts: List[str],
113
+ batch_index: int
114
+ ) -> tuple[int, List[List[float]]]:
115
+ """
116
+ 带信号量控制的批次处理
117
+
118
+ Returns:
119
+ (batch_index, embeddings) - 返回索引用于结果排序
120
+ """
121
+ async with self._semaphore:
122
+ try:
123
+ embeddings = await self._embed_single_batch(batch_texts)
124
+ logger.debug(f"✅ 批次 {batch_index} 完成: {len(batch_texts)} 文本")
125
+ return (batch_index, embeddings)
126
+ except Exception as e:
127
+ self._stats["failed_requests"] += 1
128
+ logger.error(f"❌ 批次 {batch_index} 失败: {type(e).__name__}: {e}")
129
+ raise
130
+
131
+ async def embed_text(self, text: str) -> List[float]:
132
+ """
133
+ 获取单个文本的 Embedding
134
+
135
+ Args:
136
+ text: 输入文本
137
+
138
+ Returns:
139
+ embedding 向量,失败返回空列表
140
+ """
141
+ try:
142
+ processed = self._preprocess_text(text)
143
+ if not processed:
144
+ return []
145
+
146
+ self._stats["total_texts"] += 1
147
+ embeddings = await self._embed_single_batch([processed])
148
+ return embeddings[0] if embeddings else []
149
+ except Exception as e:
150
+ logger.error(f"embed_text 失败: {e}")
151
+ return []
152
+
153
+ async def embed_batch(
154
+ self,
155
+ texts: List[str],
156
+ show_progress: bool = False
157
+ ) -> List[List[float]]:
158
+ """
159
+ 批量获取 Embedding (并发优化)
160
+
161
+ Args:
162
+ texts: 文本列表
163
+ show_progress: 是否显示进度日志
164
+
165
+ Returns:
166
+ embedding 向量列表 (���输入顺序一致)
167
+ 失败的文本对应空列表
168
+ """
169
+ if not texts:
170
+ return []
171
+
172
+ # 预处理所有文本
173
+ processed_texts = [self._preprocess_text(t) for t in texts]
174
+ self._stats["total_texts"] += len(texts)
175
+
176
+ # 分批
177
+ batch_size = self.config.batch_size
178
+ batches = [
179
+ processed_texts[i:i + batch_size]
180
+ for i in range(0, len(processed_texts), batch_size)
181
+ ]
182
+
183
+ total_batches = len(batches)
184
+ if show_progress:
185
+ logger.info(
186
+ f"📊 Embedding: {len(texts)} 文本 → {total_batches} 批次 "
187
+ f"(并发: {self.config.max_concurrent_batches})"
188
+ )
189
+
190
+ # 并发执行所有批次
191
+ tasks = [
192
+ self._embed_batch_with_semaphore(batch, idx)
193
+ for idx, batch in enumerate(batches)
194
+ ]
195
+
196
+ # 收集结果
197
+ results = await asyncio.gather(*tasks, return_exceptions=True)
198
+
199
+ # 按批次索引排序并合并结果
200
+ embeddings = []
201
+ for result in sorted(results, key=lambda x: x[0] if isinstance(x, tuple) else float('inf')):
202
+ if isinstance(result, tuple):
203
+ batch_idx, batch_embeddings = result
204
+ embeddings.extend(batch_embeddings)
205
+ else:
206
+ # 异常情况: 填充空向量
207
+ # 找出这个批次有多少文本
208
+ failed_batch_size = batch_size # 保守估计
209
+ embeddings.extend([[] for _ in range(failed_batch_size)])
210
+ logger.warning(f"批次失败,填充 {failed_batch_size} 个空向量")
211
+
212
+ # 确保返回数量与输入一致
213
+ if len(embeddings) < len(texts):
214
+ embeddings.extend([[] for _ in range(len(texts) - len(embeddings))])
215
+ elif len(embeddings) > len(texts):
216
+ embeddings = embeddings[:len(texts)]
217
+
218
+ if show_progress:
219
+ success_count = sum(1 for e in embeddings if e)
220
+ logger.info(f"✅ Embedding 完成: {success_count}/{len(texts)} 成功")
221
+
222
+ return embeddings
223
+
224
+ def get_stats(self) -> dict:
225
+ """获取统计信息"""
226
+ return self._stats.copy()
227
+
228
+ def reset_stats(self):
229
+ """重置统计信息"""
230
+ for key in self._stats:
231
+ self._stats[key] = 0
232
+
233
+
234
+ # 全局单例
235
+ _embedding_service: Optional[EmbeddingService] = None
236
+
237
+
238
+ def get_embedding_service(config: Optional[EmbeddingConfig] = None) -> EmbeddingService:
239
+ """获取 Embedding 服务单例"""
240
+ global _embedding_service
241
+ if _embedding_service is None:
242
+ _embedding_service = EmbeddingService(config)
243
+ return _embedding_service
244
+
245
+
246
+ # 便捷函数
247
+ async def embed_text(text: str) -> List[float]:
248
+ """快捷方式: 获取单个文本的 Embedding"""
249
+ return await get_embedding_service().embed_text(text)
250
+
251
+
252
+ async def embed_batch(texts: List[str], show_progress: bool = False) -> List[List[float]]:
253
+ """快捷方式: 批量获取 Embedding"""
254
+ return await get_embedding_service().embed_batch(texts, show_progress)
app/utils/github_client.py ADDED
@@ -0,0 +1,478 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ GitHub 异步客户端
4
+
5
+ 设计原则:
6
+ 1. 异步非阻塞 - 使用 httpx.AsyncClient
7
+ 2. 连接池复用 - 单例模式管理客户端生命周期
8
+ 3. 自动重试 - 集成 tenacity 处理瞬时错误
9
+ 4. 类型安全 - 完整的类型注解
10
+ 5. 可扩展 - 易于添加新的 API 端点
11
+ """
12
+
13
+ import asyncio
14
+ import base64
15
+ import logging
16
+ import os
17
+ from dataclasses import dataclass, field
18
+ from typing import List, Optional, Dict, Any, Set
19
+ from contextlib import asynccontextmanager
20
+
21
+ import httpx
22
+
23
+ from app.core.config import settings
24
+ from app.utils.retry import llm_retry # 复用已有的重试装饰器
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ # ============================================================
30
+ # 数据模型
31
+ # ============================================================
32
+
33
+ @dataclass
34
+ class GitHubFile:
35
+ """GitHub 文件信息"""
36
+ path: str
37
+ type: str # "blob" | "tree"
38
+ size: int = 0
39
+ sha: str = ""
40
+
41
+ @property
42
+ def is_file(self) -> bool:
43
+ return self.type == "blob"
44
+
45
+ @property
46
+ def is_directory(self) -> bool:
47
+ return self.type == "tree"
48
+
49
+
50
+ @dataclass
51
+ class GitHubRepo:
52
+ """GitHub 仓库信息"""
53
+ owner: str
54
+ name: str
55
+ default_branch: str = "main"
56
+ description: str = ""
57
+ stars: int = 0
58
+
59
+ @property
60
+ def full_name(self) -> str:
61
+ return f"{self.owner}/{self.name}"
62
+
63
+
64
+ @dataclass
65
+ class FileFilter:
66
+ """文件过滤配置"""
67
+ ignored_extensions: Set[str] = field(default_factory=lambda: {
68
+ '.png', '.jpg', '.jpeg', '.gif', '.svg', '.ico', '.mp4', '.webp',
69
+ '.pyc', '.pyo', '.lock', '.zip', '.tar', '.gz', '.pdf', '.woff', '.woff2',
70
+ '.DS_Store', '.gitignore', '.gitattributes', '.editorconfig'
71
+ })
72
+
73
+ ignored_directories: Set[str] = field(default_factory=lambda: {
74
+ '.git', '.github', '.vscode', '.idea', '__pycache__',
75
+ 'node_modules', 'venv', 'env', '.env', 'build', 'dist',
76
+ 'site-packages', 'migrations', '.next', '.nuxt', 'coverage',
77
+ 'vendor', 'target', 'out', 'bin', 'obj'
78
+ })
79
+
80
+ max_file_size: int = 500_000 # 500KB
81
+
82
+ def should_include(self, file: GitHubFile) -> bool:
83
+ """判断文件是否应该被包含"""
84
+ if not file.is_file:
85
+ return False
86
+
87
+ # 检查目录
88
+ path_parts = file.path.split("/")
89
+ if any(part in self.ignored_directories for part in path_parts):
90
+ return False
91
+
92
+ # 检查扩展名
93
+ ext = os.path.splitext(file.path)[1].lower()
94
+ if ext in self.ignored_extensions:
95
+ return False
96
+
97
+ # 检查文件大小
98
+ if file.size > self.max_file_size:
99
+ return False
100
+
101
+ return True
102
+
103
+
104
+ # ============================================================
105
+ # 异常定义
106
+ # ============================================================
107
+
108
+ class GitHubError(Exception):
109
+ """GitHub API 错误基类"""
110
+ def __init__(self, message: str, status_code: int = 0):
111
+ self.message = message
112
+ self.status_code = status_code
113
+ super().__init__(message)
114
+
115
+
116
+ class GitHubAuthError(GitHubError):
117
+ """认证错误 (401)"""
118
+ pass
119
+
120
+
121
+ class GitHubRateLimitError(GitHubError):
122
+ """速率限制错误 (403)"""
123
+ pass
124
+
125
+
126
+ class GitHubNotFoundError(GitHubError):
127
+ """资源不存在 (404)"""
128
+ pass
129
+
130
+
131
+ # ============================================================
132
+ # GitHub 异步客户端
133
+ # ============================================================
134
+
135
+ class GitHubClient:
136
+ """
137
+ GitHub 异步 API 客户端
138
+
139
+ 使用示例:
140
+ ```python
141
+ async with GitHubClient() as client:
142
+ repo = await client.get_repo("owner", "repo")
143
+ files = await client.get_repo_tree(repo)
144
+ content = await client.get_file_content(repo, "README.md")
145
+ ```
146
+ """
147
+
148
+ BASE_URL = "https://api.github.com"
149
+
150
+ def __init__(
151
+ self,
152
+ token: Optional[str] = None,
153
+ timeout: float = 30.0,
154
+ max_concurrent_requests: int = 10
155
+ ):
156
+ self.token = token or settings.GITHUB_TOKEN
157
+ self.timeout = timeout
158
+ self._client: Optional[httpx.AsyncClient] = None
159
+ self._semaphore = asyncio.Semaphore(max_concurrent_requests)
160
+
161
+ @property
162
+ def _headers(self) -> Dict[str, str]:
163
+ """构建请求头"""
164
+ headers = {
165
+ "Accept": "application/vnd.github.v3+json",
166
+ "User-Agent": "GitHub-Agent-Demo/1.0"
167
+ }
168
+ if self.token:
169
+ headers["Authorization"] = f"Bearer {self.token}"
170
+ return headers
171
+
172
+ async def _ensure_client(self) -> httpx.AsyncClient:
173
+ """确保客户端已初始化"""
174
+ if self._client is None or self._client.is_closed:
175
+ self._client = httpx.AsyncClient(
176
+ base_url=self.BASE_URL,
177
+ headers=self._headers,
178
+ timeout=httpx.Timeout(self.timeout),
179
+ follow_redirects=True,
180
+ limits=httpx.Limits(
181
+ max_keepalive_connections=20,
182
+ max_connections=50
183
+ )
184
+ )
185
+ return self._client
186
+
187
+ async def close(self):
188
+ """关闭客户端连接"""
189
+ if self._client and not self._client.is_closed:
190
+ await self._client.aclose()
191
+ self._client = None
192
+
193
+ async def __aenter__(self):
194
+ await self._ensure_client()
195
+ return self
196
+
197
+ async def __aexit__(self, exc_type, exc_val, exc_tb):
198
+ await self.close()
199
+
200
+ def _handle_error(self, response: httpx.Response, context: str = ""):
201
+ """统一错误处理"""
202
+ status = response.status_code
203
+
204
+ try:
205
+ data = response.json()
206
+ message = data.get("message", response.text)
207
+ except Exception:
208
+ message = response.text
209
+
210
+ error_msg = f"{context}: {message}" if context else message
211
+
212
+ if status == 401:
213
+ raise GitHubAuthError(
214
+ "GitHub Token 无效或已过期,请检查 .env 配置",
215
+ status
216
+ )
217
+ elif status == 403:
218
+ if "rate limit" in message.lower():
219
+ raise GitHubRateLimitError(
220
+ "GitHub API 请求已达上限,请稍后重试或添加 Token",
221
+ status
222
+ )
223
+ raise GitHubError(error_msg, status)
224
+ elif status == 404:
225
+ raise GitHubNotFoundError(error_msg, status)
226
+ else:
227
+ raise GitHubError(error_msg, status)
228
+
229
+ @llm_retry
230
+ async def _request(
231
+ self,
232
+ method: str,
233
+ endpoint: str,
234
+ **kwargs
235
+ ) -> Dict[str, Any]:
236
+ """
237
+ 发送 API 请求 (带重试)
238
+
239
+ Args:
240
+ method: HTTP 方法
241
+ endpoint: API 端点 (如 /repos/{owner}/{repo})
242
+ **kwargs: 传递给 httpx 的参数
243
+
244
+ Returns:
245
+ JSON 响应
246
+ """
247
+ async with self._semaphore:
248
+ client = await self._ensure_client()
249
+ response = await client.request(method, endpoint, **kwargs)
250
+
251
+ if response.status_code >= 400:
252
+ self._handle_error(response, endpoint)
253
+
254
+ return response.json()
255
+
256
+ async def _request_raw(
257
+ self,
258
+ method: str,
259
+ endpoint: str,
260
+ **kwargs
261
+ ) -> httpx.Response:
262
+ """发送请求并返回原始响应"""
263
+ async with self._semaphore:
264
+ client = await self._ensure_client()
265
+ return await client.request(method, endpoint, **kwargs)
266
+
267
+ # --------------------------------------------------------
268
+ # 仓库相关 API
269
+ # --------------------------------------------------------
270
+
271
+ async def get_repo(self, owner: str, name: str) -> GitHubRepo:
272
+ """获取仓库信息"""
273
+ data = await self._request("GET", f"/repos/{owner}/{name}")
274
+
275
+ return GitHubRepo(
276
+ owner=owner,
277
+ name=name,
278
+ default_branch=data.get("default_branch", "main"),
279
+ description=data.get("description", ""),
280
+ stars=data.get("stargazers_count", 0)
281
+ )
282
+
283
+ async def get_repo_tree(
284
+ self,
285
+ repo: GitHubRepo,
286
+ file_filter: Optional[FileFilter] = None
287
+ ) -> List[GitHubFile]:
288
+ """
289
+ 获取仓库文件树
290
+
291
+ Args:
292
+ repo: 仓库信息
293
+ file_filter: 文件过滤器 (默认使用标准过滤)
294
+
295
+ Returns:
296
+ 过滤后的文件列表
297
+ """
298
+ filter_config = file_filter or FileFilter()
299
+
300
+ data = await self._request(
301
+ "GET",
302
+ f"/repos/{repo.owner}/{repo.name}/git/trees/{repo.default_branch}",
303
+ params={"recursive": "1"}
304
+ )
305
+
306
+ files = []
307
+ for item in data.get("tree", []):
308
+ file = GitHubFile(
309
+ path=item["path"],
310
+ type=item["type"],
311
+ size=item.get("size", 0),
312
+ sha=item.get("sha", "")
313
+ )
314
+
315
+ if filter_config.should_include(file):
316
+ files.append(file)
317
+
318
+ logger.info(f"📂 仓库 {repo.full_name}: 共 {len(data.get('tree', []))} 项, 过滤后 {len(files)} 文件")
319
+ return files
320
+
321
+ # --------------------------------------------------------
322
+ # 文件内容 API
323
+ # --------------------------------------------------------
324
+
325
+ async def get_file_content(
326
+ self,
327
+ repo: GitHubRepo,
328
+ path: str
329
+ ) -> Optional[str]:
330
+ """
331
+ 获取单个文件内容
332
+
333
+ Args:
334
+ repo: 仓库信息
335
+ path: 文件路径
336
+
337
+ Returns:
338
+ 文件内容 (UTF-8 解码),失败返回 None
339
+ """
340
+ try:
341
+ data = await self._request(
342
+ "GET",
343
+ f"/repos/{repo.owner}/{repo.name}/contents/{path}",
344
+ params={"ref": repo.default_branch}
345
+ )
346
+
347
+ # 处理目录情况
348
+ if isinstance(data, list):
349
+ file_names = [f["name"] for f in data]
350
+ return f"Directory '{path}' contains:\n" + "\n".join(
351
+ f"- {name}" for name in file_names
352
+ )
353
+
354
+ # 解码文件内容
355
+ content = data.get("content", "")
356
+ encoding = data.get("encoding", "base64")
357
+
358
+ if encoding == "base64":
359
+ return base64.b64decode(content).decode("utf-8")
360
+
361
+ return content
362
+
363
+ except GitHubNotFoundError:
364
+ logger.warning(f"文件不存在: {path}")
365
+ return None
366
+ except UnicodeDecodeError:
367
+ logger.warning(f"文件无法解码为 UTF-8: {path}")
368
+ return None
369
+ except Exception as e:
370
+ logger.error(f"获取文件失败 {path}: {e}")
371
+ return None
372
+
373
+ async def get_files_content(
374
+ self,
375
+ repo: GitHubRepo,
376
+ paths: List[str],
377
+ show_progress: bool = False
378
+ ) -> Dict[str, Optional[str]]:
379
+ """
380
+ 批量获取文件内容 (并发优化)
381
+
382
+ Args:
383
+ repo: 仓库信息
384
+ paths: 文件路径列表
385
+ show_progress: 是否显示进度
386
+
387
+ Returns:
388
+ {path: content} 字典
389
+ """
390
+ if not paths:
391
+ return {}
392
+
393
+ if show_progress:
394
+ logger.info(f"📥 开始下载 {len(paths)} 个文件 (并发: {self._semaphore._value})")
395
+
396
+ # 并发获取所有文件
397
+ tasks = [
398
+ self.get_file_content(repo, path)
399
+ for path in paths
400
+ ]
401
+
402
+ results = await asyncio.gather(*tasks, return_exceptions=True)
403
+
404
+ # 组装结果
405
+ content_map = {}
406
+ success_count = 0
407
+
408
+ for path, result in zip(paths, results):
409
+ if isinstance(result, Exception):
410
+ logger.error(f"下载失败 {path}: {result}")
411
+ content_map[path] = None
412
+ else:
413
+ content_map[path] = result
414
+ if result is not None:
415
+ success_count += 1
416
+
417
+ if show_progress:
418
+ logger.info(f"✅ 文件下载完成: {success_count}/{len(paths)} 成功")
419
+
420
+ return content_map
421
+
422
+
423
+ # ============================================================
424
+ # 全局单例管理
425
+ # ============================================================
426
+
427
+ _github_client: Optional[GitHubClient] = None
428
+
429
+
430
+ def get_github_client() -> GitHubClient:
431
+ """获取 GitHub 客户端单例"""
432
+ global _github_client
433
+ if _github_client is None:
434
+ _github_client = GitHubClient()
435
+ return _github_client
436
+
437
+
438
+ async def close_github_client():
439
+ """关闭全局客户端 (应用关闭时调用)"""
440
+ global _github_client
441
+ if _github_client:
442
+ await _github_client.close()
443
+ _github_client = None
444
+
445
+
446
+ # ============================================================
447
+ # 便捷函数 (兼容旧接口)
448
+ # ============================================================
449
+
450
+ def parse_repo_url(url: str) -> Optional[tuple[str, str]]:
451
+ """
452
+ 解析 GitHub URL
453
+
454
+ Args:
455
+ url: GitHub 仓库 URL
456
+
457
+ Returns:
458
+ (owner, repo) 元组,无效返回 None
459
+ """
460
+ if url.endswith(".git"):
461
+ url = url[:-4]
462
+
463
+ # 支持多种格式
464
+ # https://github.com/owner/repo
465
+ # github.com/owner/repo
466
+ # owner/repo
467
+
468
+ parts = url.replace("https://", "").replace("http://", "").split("/")
469
+
470
+ if "github.com" in parts:
471
+ idx = parts.index("github.com")
472
+ if len(parts) > idx + 2:
473
+ return (parts[idx + 1], parts[idx + 2])
474
+ elif len(parts) == 2:
475
+ # 直接是 owner/repo 格式
476
+ return (parts[0], parts[1])
477
+
478
+ return None
app/utils/llm_client.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 文件路径: app/utils/llm_client.py
2
+ """
3
+ 统一 LLM 客户端入口
4
+
5
+ 支持多个 LLM 供应商,通过 LLM_PROVIDER 环境变量切换:
6
+ - openai: OpenAI (GPT-4, GPT-4o 等)
7
+ - deepseek: DeepSeek (deepseek-chat, deepseek-coder 等)
8
+ - anthropic: Anthropic (Claude 3.5, Claude 3 等)
9
+ - gemini: Google Gemini (gemini-1.5-pro 等)
10
+
11
+ 使用方式 (与原来完全兼容):
12
+ from app.utils.llm_client import client
13
+
14
+ response = await client.chat.completions.create(
15
+ model=settings.default_model_name,
16
+ messages=[{"role": "user", "content": "Hello"}],
17
+ stream=True
18
+ )
19
+ """
20
+
21
+ from app.core.config import settings
22
+ from app.utils.llm_providers import LLMFactory, BaseLLMProvider
23
+ from typing import Optional
24
+
25
+ # 全局客户端实例
26
+ client: Optional[BaseLLMProvider] = None
27
+
28
+ def _initialize_client() -> Optional[BaseLLMProvider]:
29
+ """
30
+ 初始化 LLM 客户端
31
+
32
+ 根据配置的 LLM_PROVIDER 创建对应的客户端实例。
33
+ """
34
+ provider = settings.LLM_PROVIDER.lower()
35
+ api_key = settings.current_api_key
36
+ base_url = settings.current_base_url
37
+ model_name = settings.default_model_name
38
+
39
+ if not api_key:
40
+ print(f"❌ 未找到 {provider.upper()}_API_KEY")
41
+ return None
42
+
43
+ try:
44
+ return LLMFactory.create(
45
+ provider=provider,
46
+ api_key=api_key,
47
+ model_name=model_name,
48
+ base_url=base_url,
49
+ temperature=settings.LLM_TEMPERATURE,
50
+ max_tokens=settings.LLM_MAX_TOKENS,
51
+ timeout=settings.LLM_TIMEOUT,
52
+ )
53
+ except Exception as e:
54
+ print(f"❌ LLM Client 初始化失败: {e}")
55
+ return None
56
+
57
+
58
+ def get_client() -> Optional[BaseLLMProvider]:
59
+ """
60
+ 获取 LLM 客户端实例
61
+
62
+ 如果客户端尚未初始化,会自动初始化。
63
+ """
64
+ global client
65
+ if client is None:
66
+ client = _initialize_client()
67
+ return client
68
+
69
+
70
+ def reinitialize_client(
71
+ provider: str = None,
72
+ api_key: str = None,
73
+ model_name: str = None,
74
+ base_url: str = None,
75
+ ) -> Optional[BaseLLMProvider]:
76
+ """
77
+ 重新初始化客户端
78
+
79
+ 用于运行时切换 LLM 供应商或模型。
80
+
81
+ Args:
82
+ provider: 新的供应商 (可选)
83
+ api_key: 新的 API Key (可选)
84
+ model_name: 新的模型名称 (可选)
85
+ base_url: 新的 Base URL (可选)
86
+ """
87
+ global client
88
+
89
+ _provider = provider or settings.LLM_PROVIDER
90
+ _api_key = api_key or settings.current_api_key
91
+ _model_name = model_name or settings.default_model_name
92
+ _base_url = base_url or settings.current_base_url
93
+
94
+ try:
95
+ client = LLMFactory.create(
96
+ provider=_provider,
97
+ api_key=_api_key,
98
+ model_name=_model_name,
99
+ base_url=_base_url,
100
+ )
101
+ return client
102
+ except Exception as e:
103
+ print(f"❌ 重新初始化失败: {e}")
104
+ return None
105
+
106
+
107
+ # 自动初始化客户端
108
+ client = _initialize_client()
app/utils/llm_providers/__init__.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 文件路径: app/utils/llm_providers/__init__.py
2
+ """
3
+ 多 LLM 供应商支持模块
4
+
5
+ 支持的供应商:
6
+ - OpenAI (GPT-4, GPT-4o, GPT-3.5-turbo 等)
7
+ - DeepSeek (deepseek-chat, deepseek-coder 等)
8
+ - Anthropic (Claude 3.5, Claude 3 等)
9
+ - Google Gemini (gemini-pro, gemini-1.5-pro 等)
10
+ """
11
+
12
+ from .base import BaseLLMProvider, LLMResponse, LLMConfig
13
+ from .openai_provider import OpenAIProvider
14
+ from .deepseek_provider import DeepSeekProvider
15
+ from .anthropic_provider import AnthropicProvider
16
+ from .gemini_provider import GeminiProvider
17
+ from .factory import LLMFactory, get_llm_client
18
+
19
+ __all__ = [
20
+ "BaseLLMProvider",
21
+ "LLMResponse",
22
+ "LLMConfig",
23
+ "OpenAIProvider",
24
+ "DeepSeekProvider",
25
+ "AnthropicProvider",
26
+ "GeminiProvider",
27
+ "LLMFactory",
28
+ "get_llm_client",
29
+ ]
app/utils/llm_providers/anthropic_provider.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 文件路径: app/utils/llm_providers/anthropic_provider.py
2
+ """
3
+ Anthropic (Claude) LLM 提供商实现
4
+
5
+ 支持模型: claude-3-5-sonnet, claude-3-opus, claude-3-haiku 等
6
+ """
7
+
8
+ import uuid
9
+ import time
10
+ from typing import List, AsyncIterator
11
+
12
+ from .base import (
13
+ BaseLLMProvider,
14
+ LLMConfig,
15
+ LLMMessage,
16
+ LLMResponse,
17
+ LLMChoice,
18
+ LLMUsage,
19
+ LLMProviderType
20
+ )
21
+
22
+
23
+ class AnthropicProvider(BaseLLMProvider):
24
+ """
25
+ Anthropic (Claude) API 提供商
26
+
27
+ 注意: Anthropic 的消息格式与 OpenAI 略有不同:
28
+ - system 消息需要单独传递
29
+ - messages 只包含 user/assistant 轮次
30
+ """
31
+
32
+ def __init__(self, config: LLMConfig):
33
+ super().__init__(config)
34
+ try:
35
+ from anthropic import AsyncAnthropic
36
+ self._client = AsyncAnthropic(
37
+ api_key=config.api_key,
38
+ timeout=config.timeout
39
+ )
40
+ self._available = True
41
+ except ImportError:
42
+ print("⚠️ anthropic 包未安装,请运行: pip install anthropic")
43
+ self._client = None
44
+ self._available = False
45
+
46
+ def _extract_system_message(self, messages: List[LLMMessage]) -> tuple:
47
+ """
48
+ 提取 system 消息
49
+
50
+ Anthropic 需要将 system 消息单独传递,
51
+ 不能放在 messages 列表中。
52
+
53
+ Returns:
54
+ (system_prompt, filtered_messages)
55
+ """
56
+ system_prompt = None
57
+ filtered_messages = []
58
+
59
+ for msg in messages:
60
+ if msg.role == "system":
61
+ system_prompt = msg.content
62
+ else:
63
+ filtered_messages.append(msg)
64
+
65
+ return system_prompt, filtered_messages
66
+
67
+ async def chat_completions_create(
68
+ self,
69
+ messages: List[LLMMessage],
70
+ model: str,
71
+ temperature: float,
72
+ max_tokens: int,
73
+ timeout: int,
74
+ **kwargs
75
+ ) -> LLMResponse:
76
+ """非流式请求"""
77
+ if not self._available:
78
+ raise RuntimeError("Anthropic client not available. Please install: pip install anthropic")
79
+
80
+ system_prompt, filtered_messages = self._extract_system_message(messages)
81
+
82
+ # 转换消息格式
83
+ api_messages = [
84
+ {"role": m.role, "content": m.content}
85
+ for m in filtered_messages
86
+ ]
87
+
88
+ # 构建请求参数
89
+ request_params = {
90
+ "model": model,
91
+ "messages": api_messages,
92
+ "temperature": temperature,
93
+ "max_tokens": max_tokens,
94
+ }
95
+
96
+ if system_prompt:
97
+ request_params["system"] = system_prompt
98
+
99
+ response = await self._client.messages.create(**request_params)
100
+
101
+ # 转换为统一格式
102
+ content = ""
103
+ if response.content:
104
+ # Anthropic 的 content 是一个 list
105
+ for block in response.content:
106
+ if hasattr(block, 'text'):
107
+ content += block.text
108
+
109
+ choices = [
110
+ LLMChoice(
111
+ index=0,
112
+ message=LLMMessage(role="assistant", content=content),
113
+ finish_reason=response.stop_reason
114
+ )
115
+ ]
116
+
117
+ usage = LLMUsage(
118
+ prompt_tokens=response.usage.input_tokens,
119
+ completion_tokens=response.usage.output_tokens,
120
+ total_tokens=response.usage.input_tokens + response.usage.output_tokens
121
+ )
122
+
123
+ return LLMResponse(
124
+ id=response.id,
125
+ model=response.model,
126
+ choices=choices,
127
+ usage=usage,
128
+ created=int(time.time())
129
+ )
130
+
131
+ async def chat_completions_create_stream(
132
+ self,
133
+ messages: List[LLMMessage],
134
+ model: str,
135
+ temperature: float,
136
+ max_tokens: int,
137
+ timeout: int,
138
+ **kwargs
139
+ ) -> AsyncIterator[LLMResponse]:
140
+ """流式请求"""
141
+ if not self._available:
142
+ raise RuntimeError("Anthropic client not available. Please install: pip install anthropic")
143
+
144
+ system_prompt, filtered_messages = self._extract_system_message(messages)
145
+
146
+ api_messages = [
147
+ {"role": m.role, "content": m.content}
148
+ for m in filtered_messages
149
+ ]
150
+
151
+ request_params = {
152
+ "model": model,
153
+ "messages": api_messages,
154
+ "temperature": temperature,
155
+ "max_tokens": max_tokens,
156
+ }
157
+
158
+ if system_prompt:
159
+ request_params["system"] = system_prompt
160
+
161
+ response_id = f"msg_{uuid.uuid4().hex[:24]}"
162
+
163
+ async with self._client.messages.stream(**request_params) as stream:
164
+ async for text in stream.text_stream:
165
+ choices = [
166
+ LLMChoice(
167
+ index=0,
168
+ delta=LLMMessage(role="assistant", content=text),
169
+ finish_reason=None
170
+ )
171
+ ]
172
+ yield LLMResponse(
173
+ id=response_id,
174
+ model=model,
175
+ choices=choices,
176
+ created=int(time.time())
177
+ )
178
+
179
+ def validate_connection(self) -> bool:
180
+ """验证连接"""
181
+ return self._available and bool(self.config.api_key)
182
+
183
+
184
+ def create_anthropic_provider(
185
+ api_key: str,
186
+ model_name: str = "claude-3-5-sonnet-20241022",
187
+ **kwargs
188
+ ) -> AnthropicProvider:
189
+ """工厂函数:创建 Anthropic 提供商"""
190
+ config = LLMConfig(
191
+ provider=LLMProviderType.ANTHROPIC,
192
+ api_key=api_key,
193
+ model_name=model_name,
194
+ **kwargs
195
+ )
196
+ return AnthropicProvider(config)
app/utils/llm_providers/base.py ADDED
@@ -0,0 +1,320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 文件路径: app/utils/llm_providers/base.py
2
+ """
3
+ LLM 提供商基类定义
4
+
5
+ 定义统一的接口规范,所有供应商实现都必须遵循此规范。
6
+ 采用适配器模式,将不同供应商的 API 统一为 OpenAI 兼容格式。
7
+ """
8
+
9
+ import logging
10
+ from abc import ABC, abstractmethod
11
+ from dataclasses import dataclass, field
12
+ from typing import List, Dict, Any, Optional, AsyncIterator, Union
13
+ from enum import Enum
14
+
15
+ from app.utils.retry import llm_retry, is_retryable_error
16
+
17
+ # 配置日志
18
+ logger = logging.getLogger("llm_provider")
19
+
20
+
21
class LLMProviderType(str, Enum):
    """Supported LLM vendor identifiers (string-valued for easy config parsing)."""

    OPENAI = "openai"
    DEEPSEEK = "deepseek"
    ANTHROPIC = "anthropic"
    GEMINI = "gemini"
+
28
+
29
@dataclass
class LLMConfig:
    """Runtime configuration for one LLM provider instance."""

    provider: LLMProviderType                # which backend this config targets
    api_key: str
    model_name: str
    base_url: Optional[str] = None           # custom endpoint; None = SDK default
    temperature: float = 0.1
    max_tokens: int = 4096
    timeout: int = 600                       # request timeout, seconds
    extra_params: Dict[str, Any] = field(default_factory=dict)  # provider-specific extras
+ extra_params: Dict[str, Any] = field(default_factory=dict)
40
+
41
+
42
@dataclass
class LLMMessage:
    """A single chat message in OpenAI-compatible form."""

    role: str      # "system" | "user" | "assistant"
    content: str   # message text
47
+
48
+
49
@dataclass
class LLMUsage:
    """Token accounting for a single request."""

    prompt_tokens: int = 0
    completion_tokens: int = 0
    total_tokens: int = 0
55
+
56
+
57
@dataclass
class LLMChoice:
    """One candidate answer (OpenAI-compatible "choice")."""

    index: int
    message: Optional[LLMMessage] = None   # populated on non-streaming responses
    delta: Optional[LLMMessage] = None     # populated on streaming chunks
    finish_reason: Optional[str] = None    # e.g. "stop"; None while streaming
64
+
65
+
66
@dataclass
class LLMResponse:
    """
    Unified LLM response envelope.

    Mirrors OpenAI's ChatCompletion layout so existing call sites work
    unchanged regardless of the underlying vendor.
    """

    id: str
    model: str
    choices: List[LLMChoice]
    usage: Optional[LLMUsage] = None
    created: int = 0

    @property
    def content(self) -> str:
        """Convenience accessor: text of the first choice's message, or '' when absent."""
        first = self.choices[0] if self.choices else None
        if first is not None and first.message is not None:
            return first.message.content
        return ""
86
+
87
+
88
+ # 辅助类定义(在 BaseLLMProvider 外部,避免嵌套类问题)
89
class _CompletionsNamespace:
    """
    Emulates the OpenAI SDK's ``client.chat.completions`` namespace.

    Routes calls to the wrapped provider and layers retry behaviour on top,
    so callers written against the OpenAI SDK work with any provider.
    """

    def __init__(self, provider: 'BaseLLMProvider'):
        self._provider = provider

    async def create(
        self,
        model: str = None,
        messages: List[Dict[str, str]] = None,
        temperature: float = None,
        max_tokens: int = None,
        stream: bool = False,
        timeout: int = None,
        **kwargs
    ) -> Union[LLMResponse, AsyncIterator[LLMResponse]]:
        """
        Unified ``completions.create`` entry point (OpenAI-SDK compatible).

        Example:
            response = await client.chat.completions.create(
                model="gpt-4",
                messages=[{"role": "user", "content": "Hello"}],
                stream=True
            )

        Built-in retry:
        - network errors, timeouts and rate limits are retried automatically
          with exponential backoff, up to 3 attempts.

        Args:
            model: model name; falls back to the provider's configured model.
            messages: OpenAI-format message dicts.
            temperature/max_tokens/timeout: fall back to provider config when unset.
            stream: when True, returns an async iterator of chunk responses.

        Returns:
            LLMResponse, or an AsyncIterator[LLMResponse] when stream=True.
        """
        # Fall back to the provider's configured defaults for anything unset.
        _model = model or self._provider.config.model_name
        _temperature = temperature if temperature is not None else self._provider.config.temperature
        _max_tokens = max_tokens or self._provider.config.max_tokens
        _timeout = timeout or self._provider.config.timeout

        # Normalise wire-format dicts into internal LLMMessage objects.
        _messages = [
            LLMMessage(role=m["role"], content=m["content"])
            for m in (messages or [])
        ]

        if stream:
            # Streaming: return an async generator with its own retry logic.
            return self._create_stream_with_retry(
                messages=_messages,
                model=_model,
                temperature=_temperature,
                max_tokens=_max_tokens,
                timeout=_timeout,
                **kwargs
            )
        # Non-streaming: tenacity-based retry via the @llm_retry decorator.
        return await self._create_with_retry(
            messages=_messages,
            model=_model,
            temperature=_temperature,
            max_tokens=_max_tokens,
            timeout=_timeout,
            **kwargs
        )

    @llm_retry
    async def _create_with_retry(
        self,
        messages: List[LLMMessage],
        model: str,
        temperature: float,
        max_tokens: int,
        timeout: int,
        **kwargs
    ) -> LLMResponse:
        """Non-streaming request; retried by the @llm_retry decorator."""
        logger.debug(f"🔄 LLM 请求: model={model}, messages_count={len(messages)}")
        return await self._provider.chat_completions_create(
            messages=messages,
            model=model,
            temperature=temperature,
            max_tokens=max_tokens,
            timeout=timeout,
            **kwargs
        )

    async def _create_stream_with_retry(
        self,
        messages: List[LLMMessage],
        model: str,
        temperature: float,
        max_tokens: int,
        timeout: int,
        max_retries: int = 3,
        **kwargs
    ) -> AsyncIterator[LLMResponse]:
        """
        Streaming request with retry.

        Retry policy:
        - Failures that occur BEFORE the first chunk is yielded are retried
          with exponential backoff (capped at 30s).
        - Failures AFTER data has been yielded are NOT retried: restarting the
          stream would silently re-send content the consumer already received
          (previous behaviour duplicated chunks in that case — bug fix).
        """
        import asyncio

        for attempt in range(1, max_retries + 1):
            yielded_any = False
            try:
                logger.debug(f"🔄 LLM 流式请求 (尝试 {attempt}/{max_retries}): model={model}")

                stream = self._provider.chat_completions_create_stream(
                    messages=messages,
                    model=model,
                    temperature=temperature,
                    max_tokens=max_tokens,
                    timeout=timeout,
                    **kwargs
                )

                async for chunk in stream:
                    yielded_any = True
                    yield chunk

                # Stream completed normally — done.
                return

            except Exception as e:
                retryable = (
                    not yielded_any          # never retry mid-stream (would duplicate output)
                    and is_retryable_error(e)
                    and attempt < max_retries
                )
                if not retryable:
                    logger.error(f"❌ LLM 流式请求最终失败: {type(e).__name__}: {e}")
                    raise
                wait_time = min(2 ** attempt, 30)  # exponential backoff
                logger.warning(
                    f"🔄 LLM 流式请求失败 (尝试 {attempt}/{max_retries}): "
                    f"{type(e).__name__}: {e}. 等待 {wait_time}s 后重试..."
                )
                await asyncio.sleep(wait_time)
231
+
232
+
233
class _ChatNamespace:
    """Emulates the OpenAI SDK's ``client.chat`` namespace."""

    def __init__(self, provider: 'BaseLLMProvider'):
        self._provider = provider
        # Expose client.chat.completions.create(...), as the OpenAI SDK does.
        self.completions = _CompletionsNamespace(provider)
238
+
239
+
240
class BaseLLMProvider(ABC):
    """
    Abstract base class for LLM providers.

    Concrete providers must implement:
    - chat_completions_create: non-streaming request
    - chat_completions_create_stream: streaming request
    - validate_connection: lightweight configuration check

    To keep existing call sites working, each instance also exposes an
    OpenAI-SDK-shaped ``chat.completions`` facade.
    """

    def __init__(self, config: LLMConfig):
        self.config = config
        self._client = None  # concrete subclasses assign their SDK client here
        # Facade mirroring the OpenAI SDK object layout (client.chat.completions).
        self.chat = _ChatNamespace(self)

    @abstractmethod
    async def chat_completions_create(
        self,
        messages: List[LLMMessage],
        model: str,
        temperature: float,
        max_tokens: int,
        timeout: int,
        **kwargs
    ) -> LLMResponse:
        """
        Non-streaming chat completion.

        Args:
            messages: conversation messages.
            model: model identifier.
            temperature: sampling temperature.
            max_tokens: response token cap.
            timeout: request timeout in seconds.

        Returns:
            LLMResponse in the unified format.
        """
        ...

    @abstractmethod
    async def chat_completions_create_stream(
        self,
        messages: List[LLMMessage],
        model: str,
        temperature: float,
        max_tokens: int,
        timeout: int,
        **kwargs
    ) -> AsyncIterator[LLMResponse]:
        """
        Streaming chat completion.

        Args:
            messages: conversation messages.
            model: model identifier.
            temperature: sampling temperature.
            max_tokens: response token cap.
            timeout: request timeout in seconds.

        Yields:
            LLMResponse chunks carrying ``delta`` messages.
        """
        ...

    @abstractmethod
    def validate_connection(self) -> bool:
        """Return True when the provider looks usable (e.g. API key present)."""
        ...

    @property
    def provider_name(self) -> str:
        """Vendor identifier, e.g. "openai" or "deepseek"."""
        return self.config.provider.value

    @property
    def model_name(self) -> str:
        """The configured model identifier."""
        return self.config.model_name
app/utils/llm_providers/deepseek_provider.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 文件路径: app/utils/llm_providers/deepseek_provider.py
2
+ """
3
+ DeepSeek LLM 提供商实现
4
+
5
+ DeepSeek API 兼容 OpenAI 协议,因此直接复用 OpenAI SDK。
6
+ 支持模型: deepseek-chat, deepseek-coder, deepseek-reasoner 等
7
+ """
8
+
9
+ from typing import List, AsyncIterator
10
+ from openai import AsyncOpenAI
11
+
12
+ from .base import (
13
+ BaseLLMProvider,
14
+ LLMConfig,
15
+ LLMMessage,
16
+ LLMResponse,
17
+ LLMChoice,
18
+ LLMUsage,
19
+ LLMProviderType
20
+ )
21
+
22
+
23
# Official DeepSeek API endpoint, used when no base_url is configured.
DEEPSEEK_DEFAULT_BASE_URL = "https://api.deepseek.com"


class DeepSeekProvider(BaseLLMProvider):
    """
    DeepSeek API provider.

    DeepSeek speaks the OpenAI wire protocol, so the OpenAI SDK is reused
    verbatim with a different base URL.
    """

    def __init__(self, config: LLMConfig):
        super().__init__(config)
        # Fall back to the official endpoint when none was configured.
        self._client = AsyncOpenAI(
            api_key=config.api_key,
            base_url=config.base_url or DEEPSEEK_DEFAULT_BASE_URL,
            timeout=config.timeout
        )

    @staticmethod
    def _to_api_messages(messages: List[LLMMessage]) -> list:
        """Convert internal LLMMessage objects to OpenAI wire-format dicts."""
        return [{"role": msg.role, "content": msg.content} for msg in messages]

    async def chat_completions_create(
        self,
        messages: List[LLMMessage],
        model: str,
        temperature: float,
        max_tokens: int,
        timeout: int,
        **kwargs
    ) -> LLMResponse:
        """Non-streaming request; returns the unified LLMResponse format."""
        response = await self._client.chat.completions.create(
            model=model,
            messages=self._to_api_messages(messages),
            temperature=temperature,
            max_tokens=max_tokens,
            timeout=timeout,
            **kwargs
        )

        converted_choices = [
            LLMChoice(
                index=choice.index,
                message=LLMMessage(role=choice.message.role, content=choice.message.content),
                finish_reason=choice.finish_reason,
            )
            for choice in response.choices
        ]

        token_usage = None
        if response.usage:
            token_usage = LLMUsage(
                prompt_tokens=response.usage.prompt_tokens,
                completion_tokens=response.usage.completion_tokens,
                total_tokens=response.usage.total_tokens,
            )

        return LLMResponse(
            id=response.id,
            model=response.model,
            choices=converted_choices,
            usage=token_usage,
            created=response.created,
        )

    async def chat_completions_create_stream(
        self,
        messages: List[LLMMessage],
        model: str,
        temperature: float,
        max_tokens: int,
        timeout: int,
        **kwargs
    ) -> AsyncIterator[LLMResponse]:
        """Streaming request; yields one LLMResponse per delta chunk."""
        stream = await self._client.chat.completions.create(
            model=model,
            messages=self._to_api_messages(messages),
            temperature=temperature,
            max_tokens=max_tokens,
            timeout=timeout,
            stream=True,
            **kwargs
        )

        async for chunk in stream:
            if not chunk.choices:
                continue
            first = chunk.choices[0]
            yield LLMResponse(
                id=chunk.id,
                model=chunk.model,
                choices=[
                    LLMChoice(
                        index=0,
                        delta=LLMMessage(role="assistant", content=first.delta.content or ""),
                        finish_reason=first.finish_reason,
                    )
                ],
                created=chunk.created,
            )

    def validate_connection(self) -> bool:
        """A non-empty API key is treated as a valid configuration."""
        return bool(self.config.api_key)
138
+
139
+
140
def create_deepseek_provider(
    api_key: str,
    model_name: str = "deepseek-chat",
    base_url: str = None,
    **kwargs
) -> DeepSeekProvider:
    """Factory helper: build a DeepSeekProvider from bare credentials.

    Args:
        api_key: DeepSeek API key.
        model_name: model identifier (defaults to deepseek-chat).
        base_url: custom endpoint; defaults to the official DeepSeek URL.
        **kwargs: Extra LLMConfig fields (temperature, max_tokens, timeout, ...).
    """
    provider_config = LLMConfig(
        provider=LLMProviderType.DEEPSEEK,
        api_key=api_key,
        model_name=model_name,
        base_url=base_url or DEEPSEEK_DEFAULT_BASE_URL,
        **kwargs
    )
    return DeepSeekProvider(provider_config)
app/utils/llm_providers/factory.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 文件路径: app/utils/llm_providers/factory.py
2
+ """
3
+ LLM 工厂模块
4
+
5
+ 提供统一的 LLM 客户端创建接口,根据配置自动选择合适的供应商。
6
+ """
7
+
8
+ import os
9
+ from typing import Optional
10
+
11
+ from .base import BaseLLMProvider, LLMConfig, LLMProviderType
12
+ from .openai_provider import OpenAIProvider
13
+ from .deepseek_provider import DeepSeekProvider, DEEPSEEK_DEFAULT_BASE_URL
14
+ from .anthropic_provider import AnthropicProvider
15
+ from .gemini_provider import GeminiProvider
16
+
17
+
18
class LLMFactory:
    """
    LLM client factory.

    Instantiates the right provider class for a vendor name, with sensible
    per-vendor defaults. Can also configure itself from environment variables.
    """

    # Vendor enum -> provider implementation class.
    _providers = {
        LLMProviderType.OPENAI: OpenAIProvider,
        LLMProviderType.DEEPSEEK: DeepSeekProvider,
        LLMProviderType.ANTHROPIC: AnthropicProvider,
        LLMProviderType.GEMINI: GeminiProvider,
    }

    # Vendor enum -> default model name.
    _default_models = {
        LLMProviderType.OPENAI: "gpt-4o-mini",
        LLMProviderType.DEEPSEEK: "deepseek-chat",
        LLMProviderType.ANTHROPIC: "claude-3-5-sonnet-20241022",
        LLMProviderType.GEMINI: "gemini-1.5-flash",
    }

    # Vendor enum -> default base URL (None = use the SDK default).
    _default_base_urls = {
        LLMProviderType.OPENAI: None,
        LLMProviderType.DEEPSEEK: DEEPSEEK_DEFAULT_BASE_URL,
        LLMProviderType.ANTHROPIC: None,
        LLMProviderType.GEMINI: None,
    }

    @classmethod
    def create(
        cls,
        provider: str,
        api_key: str,
        model_name: str = None,
        base_url: str = None,
        **kwargs
    ) -> Optional[BaseLLMProvider]:
        """
        Create an LLM client.

        Args:
            provider: vendor name ("openai", "deepseek", "anthropic", "gemini").
            api_key: API key for that vendor.
            model_name: model identifier (optional; vendor default used otherwise).
            base_url: custom endpoint (optional).
            **kwargs: extra LLMConfig fields.

        Returns:
            A BaseLLMProvider instance, or None when creation/validation fails.
        """
        try:
            provider_type = LLMProviderType(provider.lower())
        except ValueError:
            print(f"❌ 不支持的 LLM 提供商: {provider}")
            print(f" 支持的提供商: {', '.join([p.value for p in LLMProviderType])}")
            return None

        if not api_key:
            print(f"❌ 未提供 {provider} 的 API Key")
            return None

        provider_class = cls._providers.get(provider_type)
        if not provider_class:
            print(f"❌ 提供商 {provider} 未实现")
            return None

        # Assemble the config, filling gaps with per-vendor defaults.
        config = LLMConfig(
            provider=provider_type,
            api_key=api_key,
            model_name=model_name or cls._default_models.get(provider_type, "default"),
            base_url=base_url or cls._default_base_urls.get(provider_type),
            **kwargs
        )

        try:
            client = provider_class(config)
        except Exception as e:
            print(f"❌ {provider.upper()} Client 初始化失败: {e}")
            return None

        if not client.validate_connection():
            print(f"❌ {provider.upper()} Client 验证失败")
            return None

        print(f"✅ {provider.upper()} Client 初始化成功 (Model: {config.model_name})")
        return client

    @classmethod
    def create_from_env(cls, provider: str = None) -> Optional[BaseLLMProvider]:
        """
        Create an LLM client from environment variables.

        Recognised variables:
        - LLM_PROVIDER: vendor name (overridden by the ``provider`` argument)
        - {PROVIDER}_API_KEY: API key (e.g. OPENAI_API_KEY, DEEPSEEK_API_KEY)
        - {PROVIDER}_BASE_URL: custom endpoint (optional)
        - MODEL_NAME: model identifier (optional)

        Args:
            provider: vendor name (optional; defaults to LLM_PROVIDER, then "deepseek").

        Returns:
            A BaseLLMProvider instance, or None when no key is found.
        """
        vendor = (provider or os.getenv("LLM_PROVIDER", "deepseek")).lower()

        # Accept a couple of key-name spellings for robustness.
        key_env_names = [
            f"{vendor.upper()}_API_KEY",
            f"{vendor.upper()}API_KEY",
        ]

        api_key = None
        for candidate in key_env_names:
            api_key = os.getenv(candidate)
            if api_key:
                break

        if not api_key:
            print(f"❌ 未找到 {vendor.upper()} API Key")
            print(f" 请设置环境变量: {key_env_names[0]}")
            return None

        return cls.create(
            provider=vendor,
            api_key=api_key,
            model_name=os.getenv("MODEL_NAME"),
            base_url=os.getenv(f"{vendor.upper()}_BASE_URL")
        )
159
+
160
+
161
def get_llm_client(provider: str = None) -> Optional[BaseLLMProvider]:
    """Convenience wrapper around LLMFactory.create_from_env.

    Args:
        provider: vendor name (optional; environment decides when omitted).

    Returns:
        A BaseLLMProvider instance, or None on failure.
    """
    return LLMFactory.create_from_env(provider)
app/utils/llm_providers/gemini_provider.py ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 文件路径: app/utils/llm_providers/gemini_provider.py
2
+ """
3
+ Google Gemini LLM 提供商实现
4
+
5
+ 支持模型: gemini-1.5-pro, gemini-1.5-flash, gemini-pro 等
6
+ """
7
+
8
+ import uuid
9
+ import time
10
+ from typing import List, AsyncIterator
11
+
12
+ from .base import (
13
+ BaseLLMProvider,
14
+ LLMConfig,
15
+ LLMMessage,
16
+ LLMResponse,
17
+ LLMChoice,
18
+ LLMUsage,
19
+ LLMProviderType
20
+ )
21
+
22
+
23
class GeminiProvider(BaseLLMProvider):
    """
    Google Gemini API provider.

    Two transport modes:
    1. Native google-generativeai SDK (used when no base_url is configured)
    2. OpenAI-compatible endpoint (used when base_url is set; AI Studio / Vertex AI)
    """

    def __init__(self, config: LLMConfig):
        super().__init__(config)
        self._available = False
        # A configured base_url selects the OpenAI-compatible transport.
        self._use_openai_compat = config.base_url is not None

        if self._use_openai_compat:
            # OpenAI-compatible mode (recommended).
            try:
                from openai import AsyncOpenAI
                self._client = AsyncOpenAI(
                    api_key=config.api_key,
                    base_url=config.base_url,
                    timeout=config.timeout
                )
                self._available = True
                print(f"✅ Gemini Provider (OpenAI Compatible) initialized")
            except ImportError:
                print("⚠️ openai 包未安装")
        else:
            # Native Google AI SDK mode.
            try:
                import google.generativeai as genai
                genai.configure(api_key=config.api_key)
                self._genai = genai
                self._model = genai.GenerativeModel(config.model_name)
                self._available = True
                print(f"✅ Gemini Provider (Native SDK) initialized")
            except ImportError:
                print("⚠️ google-generativeai 包未安装,请运行: pip install google-generativeai")
                self._genai = None
                self._model = None

    def _convert_messages_to_gemini(self, messages: List[LLMMessage]) -> tuple:
        """
        Convert OpenAI-style messages to Gemini chat format.

        Gemini has no "system" role and calls the assistant role "model", so:
        - system content is buffered and prepended to the NEXT user message,
          wherever it appears (bug fix: previously it was merged only when it
          preceded ALL other messages, and was silently dropped otherwise);
        - "assistant" is mapped to "model".

        Returns:
            (history, current_message): prior turns plus the text to send now.
        """
        pending_system = ""
        converted = []

        for msg in messages:
            if msg.role == "system":
                pending_system += msg.content + "\n\n"
            elif msg.role == "assistant":
                converted.append({"role": "model", "parts": [msg.content]})
            else:  # user
                content = msg.content
                if pending_system:
                    content = pending_system + content
                    pending_system = ""
                converted.append({"role": "user", "parts": [content]})

        if not converted:
            # Only system messages (or nothing at all): send the system text
            # directly instead of losing it (bug fix).
            return [], pending_system.strip()

        # The last entry becomes the current message; the rest is history.
        if len(converted) == 1:
            return [], converted[0]["parts"][0]
        return converted[:-1], converted[-1]["parts"][0]

    async def chat_completions_create(
        self,
        messages: List[LLMMessage],
        model: str,
        temperature: float,
        max_tokens: int,
        timeout: int,
        **kwargs
    ) -> LLMResponse:
        """Non-streaming request via whichever transport is active.

        Raises:
            RuntimeError: if no SDK could be initialized.
        """
        if not self._available:
            raise RuntimeError("Gemini client not available")

        if self._use_openai_compat:
            # OpenAI-compatible mode: straight pass-through plus format mapping.
            api_messages = [
                {"role": m.role, "content": m.content}
                for m in messages
            ]

            response = await self._client.chat.completions.create(
                model=model,
                messages=api_messages,
                temperature=temperature,
                max_tokens=max_tokens,
                timeout=timeout,
                **kwargs
            )

            choices = [
                LLMChoice(
                    index=c.index,
                    message=LLMMessage(role=c.message.role, content=c.message.content),
                    finish_reason=c.finish_reason
                )
                for c in response.choices
            ]

            usage = None
            if response.usage:
                usage = LLMUsage(
                    prompt_tokens=response.usage.prompt_tokens,
                    completion_tokens=response.usage.completion_tokens,
                    total_tokens=response.usage.total_tokens
                )

            return LLMResponse(
                id=response.id,
                model=response.model,
                choices=choices,
                usage=usage,
                created=response.created
            )

        # Native SDK mode.
        history, current_msg = self._convert_messages_to_gemini(messages)

        generation_config = {
            "temperature": temperature,
            "max_output_tokens": max_tokens,
        }

        chat = self._model.start_chat(history=history)
        response = await chat.send_message_async(
            current_msg,
            generation_config=generation_config
        )

        content = response.text if response.text else ""

        choices = [
            LLMChoice(
                index=0,
                message=LLMMessage(role="assistant", content=content),
                finish_reason="stop"
            )
        ]

        # Token accounting from the native SDK, when present.
        usage = None
        if hasattr(response, 'usage_metadata') and response.usage_metadata:
            usage = LLMUsage(
                prompt_tokens=getattr(response.usage_metadata, 'prompt_token_count', 0),
                completion_tokens=getattr(response.usage_metadata, 'candidates_token_count', 0),
                total_tokens=getattr(response.usage_metadata, 'total_token_count', 0)
            )

        return LLMResponse(
            id=f"gemini-{uuid.uuid4().hex[:12]}",
            model=model,
            choices=choices,
            usage=usage,
            created=int(time.time())
        )

    async def chat_completions_create_stream(
        self,
        messages: List[LLMMessage],
        model: str,
        temperature: float,
        max_tokens: int,
        timeout: int,
        **kwargs
    ) -> AsyncIterator[LLMResponse]:
        """Streaming request; yields one LLMResponse per delta chunk.

        Raises:
            RuntimeError: if no SDK could be initialized.
        """
        if not self._available:
            raise RuntimeError("Gemini client not available")

        if self._use_openai_compat:
            # OpenAI-compatible streaming.
            api_messages = [
                {"role": m.role, "content": m.content}
                for m in messages
            ]

            stream = await self._client.chat.completions.create(
                model=model,
                messages=api_messages,
                temperature=temperature,
                max_tokens=max_tokens,
                timeout=timeout,
                stream=True,
                **kwargs
            )

            async for chunk in stream:
                if chunk.choices:
                    delta_content = chunk.choices[0].delta.content or ""
                    choices = [
                        LLMChoice(
                            index=0,
                            delta=LLMMessage(role="assistant", content=delta_content),
                            finish_reason=chunk.choices[0].finish_reason
                        )
                    ]
                    yield LLMResponse(
                        id=chunk.id,
                        model=chunk.model,
                        choices=choices,
                        created=chunk.created
                    )
        else:
            # Native SDK streaming.
            history, current_msg = self._convert_messages_to_gemini(messages)

            generation_config = {
                "temperature": temperature,
                "max_output_tokens": max_tokens,
            }

            chat = self._model.start_chat(history=history)
            response = await chat.send_message_async(
                current_msg,
                generation_config=generation_config,
                stream=True
            )

            # One synthetic id is shared by every chunk of this stream.
            response_id = f"gemini-{uuid.uuid4().hex[:12]}"

            async for chunk in response:
                if chunk.text:
                    choices = [
                        LLMChoice(
                            index=0,
                            delta=LLMMessage(role="assistant", content=chunk.text),
                            finish_reason=None
                        )
                    ]
                    yield LLMResponse(
                        id=response_id,
                        model=model,
                        choices=choices,
                        created=int(time.time())
                    )

    def validate_connection(self) -> bool:
        """Usable when an SDK was initialized and an API key is configured."""
        return self._available and bool(self.config.api_key)
277
+
278
+
279
def create_gemini_provider(
    api_key: str,
    model_name: str = "gemini-1.5-flash",
    base_url: str = None,
    **kwargs
) -> GeminiProvider:
    """Factory helper: build a GeminiProvider from bare credentials.

    Args:
        api_key: Google AI API key.
        model_name: model identifier (defaults to gemini-1.5-flash).
        base_url: OpenAI-compatible endpoint (optional). When omitted, the
            native google-generativeai SDK is used instead.
        **kwargs: Extra LLMConfig fields (temperature, max_tokens, timeout, ...).
    """
    provider_config = LLMConfig(
        provider=LLMProviderType.GEMINI,
        api_key=api_key,
        model_name=model_name,
        base_url=base_url,
        **kwargs
    )
    return GeminiProvider(provider_config)
app/utils/llm_providers/openai_provider.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 文件路径: app/utils/llm_providers/openai_provider.py
2
+ """
3
+ OpenAI LLM 提供商实现
4
+
5
+ 支持模型: GPT-4, GPT-4o, GPT-4o-mini, GPT-3.5-turbo 等
6
+ """
7
+
8
+ from typing import List, AsyncIterator
9
+ from openai import AsyncOpenAI
10
+
11
+ from .base import (
12
+ BaseLLMProvider,
13
+ LLMConfig,
14
+ LLMMessage,
15
+ LLMResponse,
16
+ LLMChoice,
17
+ LLMUsage,
18
+ LLMProviderType
19
+ )
20
+
21
+
22
class OpenAIProvider(BaseLLMProvider):
    """OpenAI API provider (GPT-4, GPT-4o, GPT-4o-mini, GPT-3.5-turbo, ...)."""

    def __init__(self, config: LLMConfig):
        super().__init__(config)
        self._client = AsyncOpenAI(
            api_key=config.api_key,
            base_url=config.base_url,  # optional custom endpoint; None = SDK default
            timeout=config.timeout
        )

    @staticmethod
    def _to_api_messages(messages: List[LLMMessage]) -> list:
        """Convert internal LLMMessage objects to OpenAI wire-format dicts."""
        return [{"role": msg.role, "content": msg.content} for msg in messages]

    async def chat_completions_create(
        self,
        messages: List[LLMMessage],
        model: str,
        temperature: float,
        max_tokens: int,
        timeout: int,
        **kwargs
    ) -> LLMResponse:
        """Non-streaming request; returns the unified LLMResponse format."""
        response = await self._client.chat.completions.create(
            model=model,
            messages=self._to_api_messages(messages),
            temperature=temperature,
            max_tokens=max_tokens,
            timeout=timeout,
            **kwargs
        )

        converted_choices = [
            LLMChoice(
                index=choice.index,
                message=LLMMessage(role=choice.message.role, content=choice.message.content),
                finish_reason=choice.finish_reason,
            )
            for choice in response.choices
        ]

        token_usage = None
        if response.usage:
            token_usage = LLMUsage(
                prompt_tokens=response.usage.prompt_tokens,
                completion_tokens=response.usage.completion_tokens,
                total_tokens=response.usage.total_tokens,
            )

        return LLMResponse(
            id=response.id,
            model=response.model,
            choices=converted_choices,
            usage=token_usage,
            created=response.created,
        )

    async def chat_completions_create_stream(
        self,
        messages: List[LLMMessage],
        model: str,
        temperature: float,
        max_tokens: int,
        timeout: int,
        **kwargs
    ) -> AsyncIterator[LLMResponse]:
        """Streaming request; yields one LLMResponse per delta chunk."""
        stream = await self._client.chat.completions.create(
            model=model,
            messages=self._to_api_messages(messages),
            temperature=temperature,
            max_tokens=max_tokens,
            timeout=timeout,
            stream=True,
            **kwargs
        )

        async for chunk in stream:
            if not chunk.choices:
                continue
            first = chunk.choices[0]
            yield LLMResponse(
                id=chunk.id,
                model=chunk.model,
                choices=[
                    LLMChoice(
                        index=0,
                        delta=LLMMessage(role="assistant", content=first.delta.content or ""),
                        finish_reason=first.finish_reason,
                    )
                ],
                created=chunk.created,
            )

    def validate_connection(self) -> bool:
        """A non-empty API key is treated as a valid configuration."""
        return bool(self.config.api_key)
129
+
130
+
131
def create_openai_provider(
    api_key: str,
    model_name: str = "gpt-4o-mini",
    base_url: str = None,
    **kwargs
) -> OpenAIProvider:
    """Factory helper: build an OpenAIProvider from bare credentials.

    Args:
        api_key: OpenAI API key.
        model_name: model identifier (defaults to gpt-4o-mini).
        base_url: custom endpoint (optional; SDK default when omitted).
        **kwargs: Extra LLMConfig fields (temperature, max_tokens, timeout, ...).
    """
    provider_config = LLMConfig(
        provider=LLMProviderType.OPENAI,
        api_key=api_key,
        model_name=model_name,
        base_url=base_url,
        **kwargs
    )
    return OpenAIProvider(provider_config)
app/utils/repo_lock.py ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ 仓库级分布式锁
4
+
5
+ 解决问题:
6
+ 1. 同一仓库的并发写入竞争 (两人同时输入同一 URL)
7
+ 2. 重新分析时的数据一致性 (用户 A 重分析,用户 B 同时查询)
8
+
9
+ 设计原则:
10
+ - 单进程: asyncio.Lock (内存锁)
11
+ - 多进程: 文件锁 (fcntl/msvcrt)
12
+ - 多节点: 可选 Redis 分布式锁 (生产环境)
13
+
14
+ 使用示例:
15
+ ```python
16
+ async with RepoLock.acquire(session_id):
17
+ # 独占访问该仓库的写操作
18
+ await vector_store.reset()
19
+ await vector_store.add_documents(docs)
20
+ ```
21
+ """
22
+
23
+ import asyncio
24
+ import logging
25
+ import os
26
+ import time
27
+ from abc import ABC, abstractmethod
28
+ from contextlib import asynccontextmanager
29
+ from dataclasses import dataclass
30
+ from pathlib import Path
31
+ from typing import Dict, Optional
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
+ # ============================================================
37
+ # 锁配置
38
+ # ============================================================
39
+
40
@dataclass
class LockConfig:
    """Lock configuration.

    NOTE: the env-based defaults below are evaluated once, at class-creation
    (module import) time — changing the environment afterwards has no effect
    on new instances.
    """

    # Lock backend: "memory" | "file" | "redis"
    backend: str = os.getenv("LOCK_BACKEND", "file")

    # Directory for file-based locks.
    lock_dir: str = os.getenv("LOCK_DIR", "data/locks")

    # Redis connection URL (only used by the redis backend).
    redis_url: str = os.getenv("REDIS_URL", "redis://localhost:6379/0")

    # Lock hold timeout in seconds (default 5 minutes).
    lock_timeout: float = float(os.getenv("LOCK_TIMEOUT", "300"))

    # How long to wait when acquiring, in seconds.
    acquire_timeout: float = float(os.getenv("LOCK_ACQUIRE_TIMEOUT", "60"))
57
+
58
+
59
+ # ============================================================
60
+ # 锁后端抽象
61
+ # ============================================================
62
+
63
class LockBackend(ABC):
    """Interface every lock backend must implement."""

    @abstractmethod
    async def acquire(self, key: str, timeout: float) -> bool:
        """Try to take the lock for ``key``; return True on success within ``timeout``."""
        ...

    @abstractmethod
    async def release(self, key: str) -> None:
        """Release the lock held for ``key``."""
        ...

    @abstractmethod
    async def is_locked(self, key: str) -> bool:
        """Report whether ``key`` is currently locked."""
        ...
80
+
81
+
82
+ # ============================================================
83
+ # 内存锁 (单进程)
84
+ # ============================================================
85
+
86
+ class MemoryLockBackend(LockBackend):
87
+ """
88
+ 内存锁后端 (asyncio.Lock)
89
+
90
+ 适用于: 单 Worker 部署
91
+ """
92
+
93
+ def __init__(self):
94
+ self._locks: Dict[str, asyncio.Lock] = {}
95
+ self._meta_lock = asyncio.Lock()
96
+
97
+ async def _get_lock(self, key: str) -> asyncio.Lock:
98
+ async with self._meta_lock:
99
+ if key not in self._locks:
100
+ self._locks[key] = asyncio.Lock()
101
+ return self._locks[key]
102
+
103
+ async def acquire(self, key: str, timeout: float) -> bool:
104
+ lock = await self._get_lock(key)
105
+ try:
106
+ await asyncio.wait_for(lock.acquire(), timeout=timeout)
107
+ return True
108
+ except asyncio.TimeoutError:
109
+ return False
110
+
111
+ async def release(self, key: str) -> None:
112
+ if key in self._locks:
113
+ lock = self._locks[key]
114
+ if lock.locked():
115
+ lock.release()
116
+
117
+ async def is_locked(self, key: str) -> bool:
118
+ if key not in self._locks:
119
+ return False
120
+ return self._locks[key].locked()
121
+
122
+
123
+ # ============================================================
124
+ # 文件锁 (多进程,单节点)
125
+ # ============================================================
126
+
127
+ class FileLockBackend(LockBackend):
128
+ """
129
+ 文件锁后端
130
+
131
+ 适用于: 多 Worker 单节点部署 (Gunicorn + Qdrant Server)
132
+
133
+ 实现:
134
+ - Windows: msvcrt.locking
135
+ - Unix: fcntl.flock
136
+ """
137
+
138
+ def __init__(self, lock_dir: str):
139
+ self._lock_dir = Path(lock_dir)
140
+ self._lock_dir.mkdir(parents=True, exist_ok=True)
141
+ self._handles: Dict[str, object] = {}
142
+ self._memory_locks: Dict[str, asyncio.Lock] = {}
143
+ self._meta_lock = asyncio.Lock()
144
+
145
+ def _get_lock_path(self, key: str) -> Path:
146
+ # 清理 key,避免路径注入
147
+ safe_key = "".join(c if c.isalnum() or c in "_-" else "_" for c in key)
148
+ return self._lock_dir / f"{safe_key}.lock"
149
+
150
+ async def _get_memory_lock(self, key: str) -> asyncio.Lock:
151
+ """同进程内的内存锁,防止同一进程内多个协程竞争文件锁"""
152
+ async with self._meta_lock:
153
+ if key not in self._memory_locks:
154
+ self._memory_locks[key] = asyncio.Lock()
155
+ return self._memory_locks[key]
156
+
157
+ async def acquire(self, key: str, timeout: float) -> bool:
158
+ # 先获取内存锁
159
+ mem_lock = await self._get_memory_lock(key)
160
+ try:
161
+ await asyncio.wait_for(mem_lock.acquire(), timeout=timeout)
162
+ except asyncio.TimeoutError:
163
+ return False
164
+
165
+ # 再获取文件锁
166
+ lock_path = self._get_lock_path(key)
167
+ start_time = time.time()
168
+
169
+ while time.time() - start_time < timeout:
170
+ try:
171
+ # 尝试获取文件锁
172
+ handle = open(lock_path, 'w')
173
+
174
+ if os.name == 'nt':
175
+ # Windows
176
+ import msvcrt
177
+ msvcrt.locking(handle.fileno(), msvcrt.LK_NBLCK, 1)
178
+ else:
179
+ # Unix
180
+ import fcntl
181
+ fcntl.flock(handle.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
182
+
183
+ self._handles[key] = handle
184
+ logger.debug(f"🔒 文件锁获取成功: {key}")
185
+ return True
186
+
187
+ except (IOError, OSError):
188
+ # 锁被占用,等待后重试
189
+ if 'handle' in dir() and handle:
190
+ handle.close()
191
+ await asyncio.sleep(0.1)
192
+
193
+ # 超时,释放内存锁
194
+ mem_lock.release()
195
+ logger.warning(f"⏰ 文件锁获取超时: {key}")
196
+ return False
197
+
198
+ async def release(self, key: str) -> None:
199
+ if key in self._handles:
200
+ handle = self._handles.pop(key)
201
+ try:
202
+ if os.name == 'nt':
203
+ import msvcrt
204
+ try:
205
+ msvcrt.locking(handle.fileno(), msvcrt.LK_UNLCK, 1)
206
+ except:
207
+ pass
208
+ else:
209
+ import fcntl
210
+ fcntl.flock(handle.fileno(), fcntl.LOCK_UN)
211
+ handle.close()
212
+ except:
213
+ pass
214
+ logger.debug(f"🔓 文件锁已释放: {key}")
215
+
216
+ # 释放内存锁
217
+ if key in self._memory_locks:
218
+ lock = self._memory_locks[key]
219
+ if lock.locked():
220
+ lock.release()
221
+
222
+ async def is_locked(self, key: str) -> bool:
223
+ lock_path = self._get_lock_path(key)
224
+ if not lock_path.exists():
225
+ return False
226
+
227
+ try:
228
+ handle = open(lock_path, 'w')
229
+ if os.name == 'nt':
230
+ import msvcrt
231
+ msvcrt.locking(handle.fileno(), msvcrt.LK_NBLCK, 1)
232
+ msvcrt.locking(handle.fileno(), msvcrt.LK_UNLCK, 1)
233
+ else:
234
+ import fcntl
235
+ fcntl.flock(handle.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
236
+ fcntl.flock(handle.fileno(), fcntl.LOCK_UN)
237
+ handle.close()
238
+ return False
239
+ except (IOError, OSError):
240
+ return True
241
+
242
+
243
+ # ============================================================
244
+ # Redis 锁 (分布式,多节点)
245
+ # ============================================================
246
+
247
+ class RedisLockBackend(LockBackend):
248
+ """
249
+ Redis 分布式锁后端
250
+
251
+ 适用于: 多节点部署 (K8s + Redis)
252
+
253
+ 依赖: redis[hiredis]
254
+ """
255
+
256
+ def __init__(self, redis_url: str, lock_timeout: float):
257
+ self._redis_url = redis_url
258
+ self._lock_timeout = lock_timeout
259
+ self._client = None
260
+ self._locks: Dict[str, object] = {}
261
+
262
+ async def _get_client(self):
263
+ if self._client is None:
264
+ try:
265
+ import redis.asyncio as aioredis
266
+ self._client = await aioredis.from_url(self._redis_url)
267
+ except ImportError:
268
+ raise RuntimeError(
269
+ "Redis 锁需要安装 redis 包: pip install redis[hiredis]"
270
+ )
271
+ return self._client
272
+
273
+ async def acquire(self, key: str, timeout: float) -> bool:
274
+ client = await self._get_client()
275
+ lock_key = f"repo_lock:{key}"
276
+
277
+ start_time = time.time()
278
+ while time.time() - start_time < timeout:
279
+ # 尝试设置锁
280
+ acquired = await client.set(
281
+ lock_key,
282
+ "locked",
283
+ nx=True,
284
+ ex=int(self._lock_timeout)
285
+ )
286
+ if acquired:
287
+ logger.debug(f"🔒 Redis 锁获取成功: {key}")
288
+ return True
289
+ await asyncio.sleep(0.1)
290
+
291
+ logger.warning(f"⏰ Redis 锁获取超时: {key}")
292
+ return False
293
+
294
+ async def release(self, key: str) -> None:
295
+ client = await self._get_client()
296
+ lock_key = f"repo_lock:{key}"
297
+ await client.delete(lock_key)
298
+ logger.debug(f"🔓 Redis 锁已释放: {key}")
299
+
300
+ async def is_locked(self, key: str) -> bool:
301
+ client = await self._get_client()
302
+ lock_key = f"repo_lock:{key}"
303
+ return await client.exists(lock_key) > 0
304
+
305
+
306
+ # ============================================================
307
+ # 统一锁接口
308
+ # ============================================================
309
+
310
+ class RepoLock:
311
+ """
312
+ 仓库级锁 - 统一接口
313
+
314
+ 自动根据配置选择后端:
315
+ - memory: 单进程内存锁 (开发)
316
+ - file: 文件锁 (多进程单节点)
317
+ - redis: 分布式锁 (多节点)
318
+
319
+ 使用:
320
+ ```python
321
+ async with RepoLock.acquire(session_id):
322
+ # 独占写操作
323
+ await store.reset()
324
+ ```
325
+ """
326
+
327
+ _backend: Optional[LockBackend] = None
328
+ _config: Optional[LockConfig] = None
329
+
330
+ @classmethod
331
+ def _get_backend(cls) -> LockBackend:
332
+ if cls._backend is None:
333
+ cls._config = LockConfig()
334
+
335
+ if cls._config.backend == "redis":
336
+ cls._backend = RedisLockBackend(
337
+ cls._config.redis_url,
338
+ cls._config.lock_timeout
339
+ )
340
+ logger.info("🔐 使用 Redis 分布式锁")
341
+ elif cls._config.backend == "file":
342
+ cls._backend = FileLockBackend(cls._config.lock_dir)
343
+ logger.info(f"🔐 使用文件锁: {cls._config.lock_dir}")
344
+ else:
345
+ cls._backend = MemoryLockBackend()
346
+ logger.info("🔐 使用内存锁 (单进程)")
347
+
348
+ return cls._backend
349
+
350
+ @classmethod
351
+ @asynccontextmanager
352
+ async def acquire(cls, session_id: str, timeout: float = None):
353
+ """
354
+ 获取仓库写锁
355
+
356
+ Args:
357
+ session_id: 仓库的 session ID
358
+ timeout: 获取锁的超时时间 (默认从配置读取)
359
+
360
+ Raises:
361
+ TimeoutError: 获取锁超时
362
+ """
363
+ backend = cls._get_backend()
364
+ config = cls._config or LockConfig()
365
+ wait_timeout = timeout or config.acquire_timeout
366
+
367
+ acquired = await backend.acquire(session_id, wait_timeout)
368
+ if not acquired:
369
+ raise TimeoutError(f"无法获取仓库锁: {session_id} (等待 {wait_timeout}s)")
370
+
371
+ try:
372
+ yield
373
+ finally:
374
+ await backend.release(session_id)
375
+
376
+ @classmethod
377
+ async def is_locked(cls, session_id: str) -> bool:
378
+ """检查仓库是否被锁定"""
379
+ backend = cls._get_backend()
380
+ return await backend.is_locked(session_id)
381
+
382
+ @classmethod
383
+ async def try_acquire(cls, session_id: str, timeout: float = 0.1):
384
+ """
385
+ 尝试获取锁 (非阻塞)
386
+
387
+ 用于检测是否有其他用户正在分析同一仓库
388
+ """
389
+ backend = cls._get_backend()
390
+ return await backend.acquire(session_id, timeout)
app/utils/retry.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 文件路径: app/utils/retry.py
2
+ """
3
+ LLM 调用重试机制
4
+
5
+ 使用 tenacity 库实现智能重试策略:
6
+ - 指数退避 (Exponential Backoff)
7
+ - 可重试异常识别
8
+ - 最大重试次数限制
9
+ - 详细日志记录
10
+ """
11
+
12
+ import logging
13
+ from typing import Callable, Type, Tuple, Any
14
+ from functools import wraps
15
+
16
+ from tenacity import (
17
+ retry,
18
+ stop_after_attempt,
19
+ wait_exponential,
20
+ retry_if_exception_type,
21
+ before_sleep_log,
22
+ after_log,
23
+ RetryError,
24
+ )
25
+
26
+ # 配置日志
27
+ logger = logging.getLogger("llm_retry")
28
+ logger.setLevel(logging.INFO)
29
+
30
+
31
+ # ============================================================================
32
+ # 可重试的异常类型定义
33
+ # ============================================================================
34
+
35
+ # 网络/临时性错误 - 应该重试
36
+ RETRYABLE_EXCEPTIONS: Tuple[Type[Exception], ...] = (
37
+ ConnectionError,
38
+ TimeoutError,
39
+ )
40
+
41
+ # 尝试导入各 SDK 的异常类型
42
+ try:
43
+ from openai import (
44
+ APIConnectionError,
45
+ APITimeoutError,
46
+ RateLimitError,
47
+ InternalServerError,
48
+ )
49
+ RETRYABLE_EXCEPTIONS = RETRYABLE_EXCEPTIONS + (
50
+ APIConnectionError,
51
+ APITimeoutError,
52
+ RateLimitError,
53
+ InternalServerError,
54
+ )
55
+ except ImportError:
56
+ pass
57
+
58
+ try:
59
+ from anthropic import (
60
+ APIConnectionError as AnthropicConnectionError,
61
+ APITimeoutError as AnthropicTimeoutError,
62
+ RateLimitError as AnthropicRateLimitError,
63
+ InternalServerError as AnthropicServerError,
64
+ )
65
+ RETRYABLE_EXCEPTIONS = RETRYABLE_EXCEPTIONS + (
66
+ AnthropicConnectionError,
67
+ AnthropicTimeoutError,
68
+ AnthropicRateLimitError,
69
+ AnthropicServerError,
70
+ )
71
+ except ImportError:
72
+ pass
73
+
74
+ try:
75
+ import httpx
76
+ RETRYABLE_EXCEPTIONS = RETRYABLE_EXCEPTIONS + (
77
+ httpx.ConnectError,
78
+ httpx.ReadTimeout,
79
+ httpx.ConnectTimeout,
80
+ )
81
+ except ImportError:
82
+ pass
83
+
84
+
85
+ # ============================================================================
86
+ # 重试配置
87
+ # ============================================================================
88
+
89
+ class RetryConfig:
90
+ """重试配置"""
91
+ MAX_ATTEMPTS: int = 3 # 最大重试次数
92
+ MIN_WAIT_SECONDS: float = 1.0 # 最小等待时间
93
+ MAX_WAIT_SECONDS: float = 30.0 # 最大等待时间
94
+ EXPONENTIAL_MULTIPLIER: float = 2.0 # 指数退避乘数
95
+
96
+
97
+ # ============================================================================
98
+ # 重试装饰器
99
+ # ============================================================================
100
+
101
+ def create_retry_decorator(
102
+ max_attempts: int = RetryConfig.MAX_ATTEMPTS,
103
+ min_wait: float = RetryConfig.MIN_WAIT_SECONDS,
104
+ max_wait: float = RetryConfig.MAX_WAIT_SECONDS,
105
+ ):
106
+ """
107
+ 创建 LLM 调用重试装饰器
108
+
109
+ Args:
110
+ max_attempts: 最大重试次数
111
+ min_wait: 最小等待时间 (秒)
112
+ max_wait: 最大等待时间 (秒)
113
+
114
+ Returns:
115
+ tenacity retry 装饰器
116
+ """
117
+ return retry(
118
+ # 重试条件: 仅对可重试异常进行重试
119
+ retry=retry_if_exception_type(RETRYABLE_EXCEPTIONS),
120
+ # 停止条件: 达到最大重试次数
121
+ stop=stop_after_attempt(max_attempts),
122
+ # 等待策略: 指数退避
123
+ wait=wait_exponential(
124
+ multiplier=RetryConfig.EXPONENTIAL_MULTIPLIER,
125
+ min=min_wait,
126
+ max=max_wait,
127
+ ),
128
+ # 日志: 重试前记录
129
+ before_sleep=before_sleep_log(logger, logging.WARNING),
130
+ # 日志: 重试后记录
131
+ after=after_log(logger, logging.DEBUG),
132
+ # 重新抛出最后一个异常
133
+ reraise=True,
134
+ )
135
+
136
+
137
+ # 默认的重试装饰器实例
138
+ llm_retry = create_retry_decorator()
139
+
140
+
141
+ def with_retry(func: Callable) -> Callable:
142
+ """
143
+ 为异步函数添加重试能力的装饰器
144
+
145
+ Usage:
146
+ @with_retry
147
+ async def call_llm(...):
148
+ ...
149
+ """
150
+ @wraps(func)
151
+ async def wrapper(*args, **kwargs):
152
+ @llm_retry
153
+ async def _inner():
154
+ return await func(*args, **kwargs)
155
+ return await _inner()
156
+ return wrapper
157
+
158
+
159
+ # ============================================================================
160
+ # 便捷函数
161
+ # ============================================================================
162
+
163
+ async def retry_async(
164
+ coro_func: Callable,
165
+ *args,
166
+ max_attempts: int = RetryConfig.MAX_ATTEMPTS,
167
+ **kwargs
168
+ ) -> Any:
169
+ """
170
+ 带重试的异步调用
171
+
172
+ Usage:
173
+ result = await retry_async(
174
+ client.chat.completions.create,
175
+ model="gpt-4",
176
+ messages=[...]
177
+ )
178
+ """
179
+ decorator = create_retry_decorator(max_attempts=max_attempts)
180
+
181
+ @decorator
182
+ async def _call():
183
+ return await coro_func(*args, **kwargs)
184
+
185
+ return await _call()
186
+
187
+
188
+ def is_retryable_error(error: Exception) -> bool:
189
+ """判断异常是否可重试"""
190
+ return isinstance(error, RETRYABLE_EXCEPTIONS)
191
+
192
+
193
+ def log_retry_info(attempt: int, max_attempts: int, error: Exception, wait_time: float):
194
+ """记录重试信息的辅助函数"""
195
+ logger.warning(
196
+ f"🔄 LLM 调用失败 (尝试 {attempt}/{max_attempts}): {type(error).__name__}: {error}. "
197
+ f"等待 {wait_time:.1f}s 后重试..."
198
+ )
app/utils/session.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Session 工具模块
4
+
5
+ 提供基于仓库 URL 的 Session ID 生成和管理
6
+ """
7
+
8
+ import hashlib
9
+ import re
10
+ from typing import Optional, Tuple, Dict
11
+ from urllib.parse import urlparse
12
+
13
+ from app.core.config import conversation_config
14
+
15
+
16
+ def normalize_repo_url(url: str) -> str:
17
+ """
18
+ 标准化 GitHub 仓库 URL
19
+
20
+ 支持格式:
21
+ - https://github.com/owner/repo
22
+ - https://github.com/owner/repo.git
23
+ - https://github.com/owner/repo/tree/main
24
+ - git@github.com:owner/repo.git
25
+
26
+ Returns:
27
+ 标准化的 URL: https://github.com/owner/repo (全小写)
28
+ """
29
+ url = url.strip().lower() # 统一转为小写
30
+
31
+ # 处理 SSH 格式
32
+ if url.startswith('git@'):
33
+ # git@github.com:owner/repo.git -> https://github.com/owner/repo
34
+ match = re.match(r'git@github\.com:(.+?)(?:\.git)?$', url)
35
+ if match:
36
+ return f"https://github.com/{match.group(1)}"
37
+
38
+ # 处理 HTTPS 格式
39
+ parsed = urlparse(url)
40
+ path = parsed.path.strip('/')
41
+
42
+ # 移除 .git 后缀
43
+ if path.endswith('.git'):
44
+ path = path[:-4]
45
+
46
+ # 只保留 owner/repo 部分
47
+ parts = path.split('/')
48
+ if len(parts) >= 2:
49
+ path = f"{parts[0]}/{parts[1]}"
50
+
51
+ return f"https://github.com/{path}"
52
+
53
+
54
+ def extract_repo_info(url: str) -> Tuple[str, str]:
55
+ """
56
+ 从 URL 提取仓库信息
57
+
58
+ Returns:
59
+ (owner, repo) 元组
60
+ """
61
+ normalized = normalize_repo_url(url)
62
+ path = urlparse(normalized).path.strip('/')
63
+ parts = path.split('/')
64
+
65
+ if len(parts) >= 2:
66
+ return parts[0], parts[1]
67
+ return "", ""
68
+
69
+
70
+ def generate_repo_session_id(repo_url: str) -> str:
71
+ """
72
+ 基于仓库 URL 生成稳定的 Session ID
73
+
74
+ 同一仓库 URL -> 同一 Session ID
75
+
76
+ 格式: repo_{short_hash}_{owner}_{repo}
77
+ """
78
+ normalized = normalize_repo_url(repo_url)
79
+ owner, repo = extract_repo_info(repo_url)
80
+
81
+ # 生成短 hash (8 字符)
82
+ url_hash = hashlib.sha256(normalized.encode()).hexdigest()[:8]
83
+
84
+ # 清理 owner 和 repo 名称
85
+ clean_owner = re.sub(r'[^a-zA-Z0-9]', '', owner)[:10]
86
+ clean_repo = re.sub(r'[^a-zA-Z0-9]', '', repo)[:15]
87
+
88
+ return f"repo_{url_hash}_{clean_owner}_{clean_repo}"
89
+
90
+
91
+ def is_repo_session_id(session_id: str) -> bool:
92
+ """判断是否为仓库级 Session ID"""
93
+ return session_id.startswith("repo_")
94
+
95
+
96
+ # === 对话历史管理 ===
97
+
98
+ class ConversationMemory:
99
+ """
100
+ 对话记忆管理 - 滑动窗口 + 摘要压缩
101
+
102
+ 特性:
103
+ 1. 保留最近 N 轮完整对话
104
+ 2. 早期对话自动压缩为摘要
105
+ 3. 支持 token 估算
106
+ """
107
+
108
+ def __init__(
109
+ self,
110
+ max_recent_turns: int = None,
111
+ max_context_tokens: int = None,
112
+ summary_threshold: int = None,
113
+ ):
114
+ # 使用统一配置
115
+ self.max_recent_turns = max_recent_turns or conversation_config.max_recent_turns
116
+ self.max_context_tokens = max_context_tokens or conversation_config.max_context_tokens
117
+ self.summary_threshold = summary_threshold or conversation_config.summary_threshold
118
+
119
+ self._messages: list = [] # 完整消息历史
120
+ self._summary: Optional[str] = None # 早期对话摘要
121
+ self._summary_up_to: int = 0 # 摘要覆盖到第 N 条消息
122
+
123
+ def add_message(self, role: str, content: str) -> None:
124
+ """添加消息"""
125
+ self._messages.append({
126
+ "role": role,
127
+ "content": content
128
+ })
129
+
130
+ def add_user_message(self, content: str) -> None:
131
+ """添加用户消息"""
132
+ self.add_message("user", content)
133
+
134
+ def add_assistant_message(self, content: str) -> None:
135
+ """添加助手消息"""
136
+ self.add_message("assistant", content)
137
+
138
+ def get_context_messages(self) -> list:
139
+ """
140
+ 获取用于 LLM 的上下文消息
141
+
142
+ 策略:
143
+ 1. 如果消息数 <= max_recent_turns * 2,返回全部
144
+ 2. 否则返回: [摘要] + 最近 N 轮
145
+ """
146
+ total_messages = len(self._messages)
147
+ max_messages = self.max_recent_turns * 2 # user + assistant = 1 轮
148
+
149
+ if total_messages <= max_messages:
150
+ return list(self._messages)
151
+
152
+ # 需要截断
153
+ recent_messages = self._messages[-max_messages:]
154
+
155
+ # 如果有摘要,加在前面
156
+ if self._summary:
157
+ return [
158
+ {"role": "system", "content": f"[Earlier conversation summary]\n{self._summary}"}
159
+ ] + recent_messages
160
+
161
+ return recent_messages
162
+
163
+ def needs_summarization(self) -> bool:
164
+ """检查是否需要生成摘要"""
165
+ unsummarized = len(self._messages) - self._summary_up_to
166
+ return unsummarized > self.summary_threshold * 2
167
+
168
+ def get_messages_to_summarize(self) -> list:
169
+ """获取需要摘要的消息"""
170
+ if not self.needs_summarization():
171
+ return []
172
+
173
+ # 保留最近的,摘要早期的
174
+ end_idx = len(self._messages) - self.max_recent_turns * 2
175
+ return self._messages[self._summary_up_to:end_idx]
176
+
177
+ def set_summary(self, summary: str, up_to_index: int) -> None:
178
+ """设置摘要"""
179
+ if self._summary:
180
+ # 合并旧摘要
181
+ self._summary = f"{self._summary}\n\n{summary}"
182
+ else:
183
+ self._summary = summary
184
+ self._summary_up_to = up_to_index
185
+
186
+ def clear(self) -> None:
187
+ """清空对话历史"""
188
+ self._messages = []
189
+ self._summary = None
190
+ self._summary_up_to = 0
191
+
192
+ def get_turn_count(self) -> int:
193
+ """获取对话轮数"""
194
+ return len(self._messages) // 2
195
+
196
+ def get_stats(self) -> dict:
197
+ """获取统计信息"""
198
+ return {
199
+ "total_messages": len(self._messages),
200
+ "turn_count": self.get_turn_count(),
201
+ "has_summary": self._summary is not None,
202
+ "summary_covers": self._summary_up_to,
203
+ }
204
+
205
+
206
+ # === 全局对话记忆存储 ===
207
+ # key: session_id, value: ConversationMemory
208
+ # 纯内存存储,服务重启自动清空
209
+ _conversation_memories: Dict[str, ConversationMemory] = {}
210
+
211
+
212
+ def get_conversation_memory(session_id: str) -> ConversationMemory:
213
+ """获取或创建对话记忆"""
214
+ if session_id not in _conversation_memories:
215
+ _conversation_memories[session_id] = ConversationMemory()
216
+ return _conversation_memories[session_id]
217
+
218
+
219
+ def clear_conversation_memory(session_id: str) -> None:
220
+ """清除对话记忆"""
221
+ if session_id in _conversation_memories:
222
+ del _conversation_memories[session_id]
223
+
224
+
225
+ def get_memory_stats() -> dict:
226
+ """获取对话记忆统计"""
227
+ return {
228
+ "total_memories": len(_conversation_memories),
229
+ "sessions": list(_conversation_memories.keys()),
230
+ }
deploy.sh ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # ============================================================
3
+ # GitHub RAG Agent - 生产环境部署脚本 (2核2G服务器优化版)
4
+ # ============================================================
5
+ #
6
+ # 使用方法:
7
+ # chmod +x deploy.sh
8
+ # ./deploy.sh
9
+ #
10
+ # 前置要求:
11
+ # - Python 3.10+
12
+ # - Docker (用于运行 Qdrant)
13
+ #
14
+ # ============================================================
15
+
16
+ set -e
17
+
18
+ echo "🚀 GitHub RAG Agent 部署脚本"
19
+ echo "=========================================="
20
+
21
+ # 检查是否在项目目录
22
+ if [ ! -f "requirements.txt" ]; then
23
+ echo "❌ 请在项目根目录运行此脚本"
24
+ exit 1
25
+ fi
26
+
27
+ # 检查 .env 文件
28
+ if [ ! -f ".env" ]; then
29
+ echo "❌ 未找到 .env 文件,请先复制 .env.example 并配置"
30
+ echo " cp .env.example .env"
31
+ echo " vim .env"
32
+ exit 1
33
+ fi
34
+
35
+ # ============================================================
36
+ # 1. 启动 Qdrant Server (Docker)
37
+ # ============================================================
38
+ echo ""
39
+ echo "📦 步骤 1: 启动 Qdrant Server..."
40
+
41
+ # 检查 Docker 是否运行
42
+ if ! docker info > /dev/null 2>&1; then
43
+ echo "❌ Docker 未运行,请先启动 Docker"
44
+ exit 1
45
+ fi
46
+
47
+ # 检查 Qdrant 容器是否已存在
48
+ if docker ps -a --format '{{.Names}}' | grep -q "^qdrant-server$"; then
49
+ echo " Qdrant 容器已存在,检查状态..."
50
+ if docker ps --format '{{.Names}}' | grep -q "^qdrant-server$"; then
51
+ echo " ✅ Qdrant 已在运行"
52
+ else
53
+ echo " 🔄 启动已有的 Qdrant 容器..."
54
+ docker start qdrant-server
55
+ fi
56
+ else
57
+ echo " 🆕 创建并启动 Qdrant 容器 (内存限制 512MB)..."
58
+ docker run -d \
59
+ --name qdrant-server \
60
+ --restart unless-stopped \
61
+ -p 6333:6333 \
62
+ -p 6334:6334 \
63
+ -v qdrant_data:/qdrant/storage \
64
+ -m 512m \
65
+ -e QDRANT__STORAGE__ON_DISK_PAYLOAD=true \
66
+ qdrant/qdrant:latest
67
+ fi
68
+
69
+ # 等待 Qdrant 就绪
70
+ echo " ⏳ 等待 Qdrant 就绪..."
71
+ for i in {1..30}; do
72
+ if curl -s http://localhost:6333/health > /dev/null 2>&1; then
73
+ echo " ✅ Qdrant 已就绪"
74
+ break
75
+ fi
76
+ sleep 1
77
+ done
78
+
79
+ # ============================================================
80
+ # 2. 创建 Python 虚拟环境
81
+ # ============================================================
82
+ echo ""
83
+ echo "🐍 步骤 2: 配置 Python 环境..."
84
+
85
+ if [ ! -d "venv" ]; then
86
+ echo " 创建虚拟环境..."
87
+ python3 -m venv venv
88
+ fi
89
+
90
+ echo " 激活虚拟环境..."
91
+ source venv/bin/activate
92
+
93
+ echo " 安装依赖..."
94
+ pip install -q --upgrade pip
95
+ pip install -q -r requirements.txt
96
+
97
+ # ============================================================
98
+ # 3. 创建必要目录
99
+ # ============================================================
100
+ echo ""
101
+ echo "📁 步骤 3: 创建数据目录..."
102
+ mkdir -p data/locks
103
+ mkdir -p data/contexts
104
+ mkdir -p logs
105
+
106
+ # ============================================================
107
+ # 4. 设置环境变量
108
+ # ============================================================
109
+ echo ""
110
+ echo "⚙️ 步骤 4: 配置环境变量..."
111
+
112
+ # 从 .env 加载
113
+ set -a
114
+ source .env
115
+ set +a
116
+
117
+ # 设置 Server 模式
118
+ export QDRANT_MODE=server
119
+ export QDRANT_URL=http://localhost:6333
120
+ export LOCK_BACKEND=file
121
+ export LOCK_DIR=data/locks
122
+ export GUNICORN_WORKERS=2
123
+
124
+ echo " QDRANT_MODE=$QDRANT_MODE"
125
+ echo " QDRANT_URL=$QDRANT_URL"
126
+ echo " GUNICORN_WORKERS=$GUNICORN_WORKERS"
127
+
128
+ # ============================================================
129
+ # 5. 启动应用
130
+ # ============================================================
131
+ echo ""
132
+ echo "🌐 步骤 5: 启动 FastAPI 应用..."
133
+ echo "=========================================="
134
+ echo " Workers: 2 (优化2核CPU)"
135
+ echo " 监听地址: 0.0.0.0:8000"
136
+ echo " Qdrant: http://localhost:6333"
137
+ echo "=========================================="
138
+ echo ""
139
+ echo " 按 Ctrl+C 停止服务"
140
+ echo ""
141
+
142
+ # 使用 Gunicorn 启动 (2 Workers)
143
+ gunicorn app.main:app -c gunicorn_conf.py
docker-compose.yml ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Docker Compose 配置 - 生产环境部署 (优化版: 2核2G服务器)
2
+ # 包含: FastAPI 应用 + Qdrant Server
3
+
4
+ version: '3.8'
5
+
6
+ services:
7
+ # ============================================================
8
+ # Qdrant 向量数据库 (限制内存 512MB)
9
+ # ============================================================
10
+ qdrant:
11
+ image: qdrant/qdrant:latest
12
+ container_name: github-rag-qdrant
13
+ restart: unless-stopped
14
+ ports:
15
+ - "6333:6333" # REST API
16
+ - "6334:6334" # gRPC
17
+ volumes:
18
+ - qdrant_data:/qdrant/storage
19
+ environment:
20
+ - QDRANT__SERVICE__GRPC_PORT=6334
21
+ - QDRANT__STORAGE__ON_DISK_PAYLOAD=true # Payload 存磁盘,省内存
22
+ deploy:
23
+ resources:
24
+ limits:
25
+ memory: 512M
26
+ reservations:
27
+ memory: 256M
28
+ healthcheck:
29
+ test: ["CMD", "curl", "-f", "http://localhost:6333/health"]
30
+ interval: 30s
31
+ timeout: 10s
32
+ retries: 3
33
+
34
+ # ============================================================
35
+ # FastAPI 应用 (2 Workers, 限制内存 1GB)
36
+ # ============================================================
37
+ app:
38
+ build:
39
+ context: .
40
+ dockerfile: Dockerfile
41
+ container_name: github-rag-app
42
+ restart: unless-stopped
43
+ ports:
44
+ - "8000:8000"
45
+ environment:
46
+ # Qdrant Server 模式
47
+ - QDRANT_MODE=server
48
+ - QDRANT_URL=http://qdrant:6333
49
+
50
+ # Worker 数量 (2核服务器建议2个)
51
+ - GUNICORN_WORKERS=2
52
+
53
+ # 文件锁 (多 Worker)
54
+ - LOCK_BACKEND=file
55
+ - LOCK_DIR=/app/data/locks
56
+
57
+ # LLM 配置 (从 .env 读取)
58
+ - LLM_PROVIDER=${LLM_PROVIDER:-deepseek}
59
+ - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY}
60
+ - OPENAI_API_KEY=${OPENAI_API_KEY}
61
+ - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
62
+ - GEMINI_API_KEY=${GEMINI_API_KEY}
63
+ - SILICON_API_KEY=${SILICON_API_KEY}
64
+ - GITHUB_TOKEN=${GITHUB_TOKEN}
65
+ volumes:
66
+ - app_data:/app/data
67
+ - app_logs:/app/logs
68
+ deploy:
69
+ resources:
70
+ limits:
71
+ memory: 1G
72
+ reservations:
73
+ memory: 512M
74
+ depends_on:
75
+ qdrant:
76
+ condition: service_healthy
77
+ healthcheck:
78
+ test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
79
+ interval: 30s
80
+ timeout: 10s
81
+ retries: 3
82
+
83
+ volumes:
84
+ qdrant_data:
85
+ driver: local
86
+ app_data:
87
+ driver: local
88
+ app_logs:
89
+ driver: local
90
+
91
+ # ============================================================
92
+ # 使用说明
93
+ # ============================================================
94
+ # 1. 复制 .env.example 为 .env 并配置 API Keys
95
+ # 2. 启动服务: docker-compose up -d
96
+ # 3. 查看日志: docker-compose logs -f app
97
+ # 4. 停止服务: docker-compose down
98
+ #
99
+ # 扩展到多 Worker:
100
+ # 修改 Dockerfile 中的 gunicorn workers 数量,或使用:
101
+ # docker-compose up -d --scale app=3
102
+ # 配合 Nginx/Traefik 做负载均衡
evaluation/__init__.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # evaluation/__init__.py
2
+ """
3
+ Evaluation 模块
4
+
5
+ 提供完整的评估框架,包括:
6
+ - 数据模型 (models.py)
7
+ - 评估引擎 (evaluation_framework.py)
8
+ - 数据路由 (data_router.py)
9
+ - 工具函数 (utils.py)
10
+ - 数据分析 (analyze_eval_results.py)
11
+ - 数据清洗 (clean_and_export_sft_data.py)
12
+
13
+ 使用示例:
14
+ from evaluation import EvaluationEngine, DataRoutingEngine, EvaluationResult
15
+ from evaluation.models import GenerationMetrics
16
+ """
17
+
18
+ # 核心导出
19
+ from evaluation.models import (
20
+ EvaluationLayer,
21
+ DataQualityTier,
22
+ QueryRewriteMetrics,
23
+ RetrievalMetrics,
24
+ GenerationMetrics,
25
+ AgenticMetrics,
26
+ EvaluationResult,
27
+ )
28
+
29
+ from evaluation.data_router import DataRoutingEngine
30
+ from evaluation.evaluation_framework import EvaluationEngine
31
+
32
+ # 工具函数
33
+ from evaluation.utils import (
34
+ is_chatty_query,
35
+ has_code_indicators,
36
+ read_jsonl,
37
+ append_jsonl,
38
+ safe_truncate,
39
+ smart_truncate,
40
+ SFTLengthConfig,
41
+ )
42
+
43
+ __all__ = [
44
+ # 枚举
45
+ "EvaluationLayer",
46
+ "DataQualityTier",
47
+ # 数据模型
48
+ "QueryRewriteMetrics",
49
+ "RetrievalMetrics",
50
+ "GenerationMetrics",
51
+ "AgenticMetrics",
52
+ "EvaluationResult",
53
+ # 引擎
54
+ "EvaluationEngine",
55
+ "DataRoutingEngine",
56
+ # 工具函数
57
+ "is_chatty_query",
58
+ "has_code_indicators",
59
+ "read_jsonl",
60
+ "append_jsonl",
61
+ "safe_truncate",
62
+ "smart_truncate",
63
+ "SFTLengthConfig",
64
+ ]
evaluation/analyze_eval_results.py ADDED
@@ -0,0 +1,379 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 文件路径: evaluation/analyze_eval_results.py
2
+ """
3
+ 自动化数据分析脚本
4
+ 用于分析评估结果,识别问题并生成诊断报告
5
+
6
+ 核心功能:
7
+ 1. 自动读取所有评估结果
8
+ 2. 按问题类型分类 Bad Case
9
+ 3. 生成可视化报告
10
+ 4. 推荐优化方向
11
+
12
+ Author: Dexter
13
+ Date: 2025-01-27
14
+ """
15
+
16
+ import os
17
+ from typing import Dict, List
18
+ from collections import Counter, defaultdict
19
+ from datetime import datetime
20
+
21
+ from evaluation.utils import read_jsonl
22
+
23
+
24
class EvaluationAnalyzer:
    """Analyzer for offline evaluation results stored as JSONL.

    Loads every record from ``eval_results_file`` eagerly at construction
    and exposes aggregate statistics, bad-case mining, per-layer performance
    breakdowns, tuning recommendations, and Markdown/CSV export.
    """

    def __init__(self, eval_results_file: str = "evaluation/sft_data/eval_results.jsonl"):
        """Load all evaluation records from the given JSONL file."""
        self.eval_results_file = eval_results_file
        self.results: List[Dict] = read_jsonl(eval_results_file)
        if not self.results:
            print(f"⚠️ No results loaded from: {eval_results_file}")

    @staticmethod
    def _median(values: List[float]) -> float:
        """True median of ``values`` (0 for an empty list).

        FIX: the previous implementation always took the upper-middle
        element, which is wrong for even-length lists.
        """
        if not values:
            return 0
        ordered = sorted(values)
        mid = len(ordered) // 2
        if len(ordered) % 2:
            return ordered[mid]
        return (ordered[mid - 1] + ordered[mid]) / 2

    def get_basic_stats(self) -> Dict:
        """Return headline statistics: count, score spread, tier distribution."""
        if not self.results:
            return {}

        scores = [r.get("overall_score", 0) for r in self.results]
        tiers = [r.get("data_quality_tier", "unknown") for r in self.results]

        return {
            "total_evaluations": len(self.results),
            "avg_score": sum(scores) / len(scores) if scores else 0,
            "max_score": max(scores) if scores else 0,
            "min_score": min(scores) if scores else 0,
            "median_score": self._median(scores),
            "quality_distribution": dict(Counter(tiers)),
            "sft_ready_count": sum(1 for r in self.results if r.get("sft_ready", False))
        }

    def identify_bad_cases(self, threshold: float = 0.6) -> List[Dict]:
        """Return results scoring below ``threshold``, worst first."""
        bad_cases = [r for r in self.results if r.get("overall_score", 1) < threshold]
        return sorted(bad_cases, key=lambda x: x.get("overall_score", 1))

    def categorize_failures(self) -> Dict[str, List[Dict]]:
        """Group bad cases by failure type.

        Failure types:
        - retrieval_failure / retrieval_low_recall: retrieval missed or weak
        - generation_hallucination / hallucination_detected: fabricated output
        - generation_incomplete: incomplete answer
        - agentic_failure: agent/tool run failed
        - unknown: no layer metrics explain the low score
        """
        categorized = defaultdict(list)

        for result in self.identify_bad_cases():
            reasons = []

            # Retrieval layer (the key may be absent, or present but None).
            retrieval = result.get("retrieval")
            if retrieval:
                if retrieval.get("hit_rate", 1) == 0:
                    reasons.append("retrieval_failure")
                elif retrieval.get("recall_at_k", 1) < 0.5:
                    reasons.append("retrieval_low_recall")

            # Generation layer.
            generation = result.get("generation")
            if generation:
                if generation.get("faithfulness", 1) < 0.5:
                    reasons.append("generation_hallucination")
                if generation.get("answer_completeness", 1) < 0.4:
                    reasons.append("generation_incomplete")
                if generation.get("hallucination_count", 0) > 0:
                    reasons.append("hallucination_detected")

            # Agent behaviour.
            agentic = result.get("agentic")
            if agentic and not agentic.get("success", True):
                reasons.append("agentic_failure")

            # No specific layer explains the low score.
            if not reasons:
                reasons.append("unknown")

            for reason in reasons:
                categorized[reason].append(result)

        return dict(categorized)

    def layer_performance(self) -> Dict[str, Dict]:
        """Per-layer score statistics (avg/min/max/count); zero scores are ignored."""
        layer_scores = defaultdict(list)

        # All four layers share the same record shape, so one loop suffices.
        for result in self.results:
            for layer in ("query_rewrite", "retrieval", "generation", "agentic"):
                metrics = result.get(layer)
                if metrics:
                    score = metrics.get("overall_score", 0)
                    if score:
                        layer_scores[layer].append(score)

        layer_stats = {}
        for layer, scores in layer_scores.items():
            if scores:
                layer_stats[layer] = {
                    "avg": sum(scores) / len(scores),
                    "min": min(scores),
                    "max": max(scores),
                    "count": len(scores)
                }

        return layer_stats

    def get_recommendations(self) -> List[str]:
        """Generate tuning advice from per-layer averages and SFT yield."""
        recommendations = []

        layer_perf = self.layer_performance()

        # Retrieval layer.
        if "retrieval" in layer_perf:
            retrieval_score = layer_perf["retrieval"]["avg"]
            if retrieval_score < 0.7:
                recommendations.append(
                    "🔴 RETRIEVAL 层性能差 (avg: {:.2f})\n"
                    " 建议:\n"
                    " 1. 检查 chunking 策略是否过度分割\n"
                    " 2. 优化 embedding 模型 (考虑更强的模型)\n"
                    " 3. 调整混合检索的权重 (BM25 vs Vector)\n"
                    " 4. 分析实际召回的文件,看是否与预期偏离".format(retrieval_score)
                )

        # Generation layer.
        if "generation" in layer_perf:
            gen_score = layer_perf["generation"]["avg"]
            if gen_score < 0.7:
                recommendations.append(
                    "🟡 GENERATION 层存在问题 (avg: {:.2f})\n"
                    " 建议:\n"
                    " 1. 检查 Prompt 是否清晰 (可能LLM理解偏差)\n"
                    " 2. 检查是否存在幻觉 (生成不存在的函数名等)\n"
                    " 3. 优化 Context 的组织方式\n"
                    " 4. 考虑使用更强的LLM模型".format(gen_score)
                )

        # Query-rewrite layer.
        if "query_rewrite" in layer_perf:
            rewrite_score = layer_perf["query_rewrite"]["avg"]
            if rewrite_score < 0.6:
                recommendations.append(
                    "🟠 QUERY_REWRITE 层准确度低 (avg: {:.2f})\n"
                    " 建议:\n"
                    " 1. 优化关键词提取 Prompt\n"
                    " 2. 增加多语言处理支持\n"
                    " 3. 添加领域词汇表 (Domain Vocabulary)".format(rewrite_score)
                )

        # Generic advice: not enough SFT-ready data.
        stats = self.get_basic_stats()
        if stats.get("sft_ready_count", 0) / max(stats.get("total_evaluations", 1), 1) < 0.5:
            recommendations.append(
                "⚠️ SFT 可用数据不足 (< 50%)\n"
                " 立即行动:\n"
                " 1. 运行 continuous_eval 脚本收集更多数据\n"
                " 2. 对现有数据进行自纠正 (Self-Correction)\n"
                " 3. 扩展黄金数据集来改进模型"
            )

        return recommendations

    def generate_report(self, output_file: str = "evaluation/analysis_report.md") -> str:
        """Write a full Markdown diagnosis report to ``output_file`` and return its text."""

        report = []
        report.append("# 📊 GitHub Agent 评估分析报告\n")
        report.append(f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        report.append("---\n")

        # 1. Headline statistics.
        stats = self.get_basic_stats()
        report.append("## 📈 基本统计\n")
        report.append(f"- 总评估次数: {stats.get('total_evaluations', 0)}\n")
        report.append(f"- 平均得分: {stats.get('avg_score', 0):.3f}\n")
        report.append(f"- 最高得分: {stats.get('max_score', 0):.3f}\n")
        report.append(f"- 最低得分: {stats.get('min_score', 0):.3f}\n")
        report.append(f"- 中位数得分: {stats.get('median_score', 0):.3f}\n")
        report.append(f"- SFT 可用样本: {stats.get('sft_ready_count', 0)}\n\n")

        # 2. Quality-tier distribution.
        report.append("## 🏆 质量分级分布\n")
        distribution = stats.get("quality_distribution", {})
        for tier, count in sorted(distribution.items()):
            percentage = (count / stats.get('total_evaluations', 1)) * 100
            report.append(f"- {tier.upper()}: {count} ({percentage:.1f}%)\n")
        report.append("\n")

        # 3. Per-layer performance.
        report.append("## 🎯 各层性能分析\n\n")
        layer_perf = self.layer_performance()
        for layer in ["query_rewrite", "retrieval", "generation", "agentic"]:
            if layer in layer_perf:
                perf = layer_perf[layer]
                report.append(f"### {layer.upper()}\n")
                report.append(f"- 平均得分: {perf['avg']:.3f}\n")
                report.append(f"- 范围: [{perf['min']:.3f}, {perf['max']:.3f}]\n")
                report.append(f"- 样本数: {perf['count']}\n\n")

        # 4. Bad-case breakdown (top 3 per category).
        report.append("## 🔴 Bad Case 分析\n\n")
        failures = self.categorize_failures()
        for reason, cases in sorted(failures.items(), key=lambda x: -len(x[1])):
            report.append(f"### {reason} ({len(cases)} cases)\n")
            for case in cases[:3]:
                report.append(f"- 查询: {case.get('query', 'N/A')[:60]}...\n")
                report.append(f" 得分: {case.get('overall_score', 0):.3f}\n")
            report.append("\n")

        # 5. Recommended actions.
        report.append("## 💡 优化建议\n\n")
        recommendations = self.get_recommendations()
        for i, rec in enumerate(recommendations, 1):
            report.append(f"{i}. {rec}\n\n")

        # FIX: only create the parent directory when the path actually has
        # one — os.makedirs("") raises for bare filenames.
        parent = os.path.dirname(output_file)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as f:
            f.writelines(report)

        return "".join(report)

    def export_bad_cases_csv(self, output_file: str = "evaluation/bad_cases.csv") -> None:
        """Export bad cases to CSV for manual review.

        FIX: layer metrics may be stored as ``None`` (not just absent);
        the previous ``case.get("retrieval", {})`` then crashed with
        ``AttributeError: 'NoneType' object has no attribute 'get'``.
        """
        import csv

        bad_cases = self.identify_bad_cases()

        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=[
                "query", "overall_score", "tier",
                "retrieval_score", "generation_score", "agentic_score",
                "error_message", "timestamp"
            ])

            writer.writeheader()
            for case in bad_cases:
                writer.writerow({
                    "query": case.get("query", ""),
                    "overall_score": case.get("overall_score", 0),
                    "tier": case.get("data_quality_tier", "unknown"),
                    "retrieval_score": (case.get("retrieval") or {}).get("overall_score", 0),
                    "generation_score": (case.get("generation") or {}).get("overall_score", 0),
                    "agentic_score": (case.get("agentic") or {}).get("overall_score", 0),
                    "error_message": case.get("error_message", ""),
                    "timestamp": case.get("timestamp", "")
                })

        print(f"✅ Exported {len(bad_cases)} bad cases to {output_file}")
290
+
291
+
292
+ # ============================================================================
293
+ # 命令行工具
294
+ # ============================================================================
295
+
296
+ def print_summary(analyzer: EvaluationAnalyzer):
297
+ """打印摘要"""
298
+ print("\n" + "=" * 70)
299
+ print("📊 评估结果摘要")
300
+ print("=" * 70)
301
+
302
+ stats = analyzer.get_basic_stats()
303
+
304
+ print(f"\n📈 基本统计:")
305
+ print(f" 总评估: {stats.get('total_evaluations', 0)}")
306
+ print(f" 平均分: {stats.get('avg_score', 0):.3f}")
307
+ print(f" 分布: {stats.get('quality_distribution', {})}")
308
+ print(f" SFT可用: {stats.get('sft_ready_count', 0)}")
309
+
310
+ print(f"\n🎯 各层性能:")
311
+ layer_perf = analyzer.layer_performance()
312
+ for layer, perf in layer_perf.items():
313
+ print(f" {layer:.<30} {perf['avg']:.3f} (avg)")
314
+
315
+ print(f"\n🔴 Bad Case Top 5:")
316
+ bad_cases = analyzer.identify_bad_cases()[:5]
317
+ for i, case in enumerate(bad_cases, 1):
318
+ print(f" {i}. {case.get('query', 'N/A')[:40]:<40} Score: {case.get('overall_score', 0):.3f}")
319
+
320
+ print(f"\n💡 优化建议:")
321
+ recommendations = analyzer.get_recommendations()
322
+ for rec in recommendations[:3]:
323
+ print(f" - {rec.split(chr(10))[0]}")
324
+
325
+ print("\n" + "=" * 70)
326
+
327
+
328
def main():
    """CLI entry point: dispatch the first argv token to an analysis action."""
    import sys

    analyzer = EvaluationAnalyzer()

    # No sub-command: print usage help and stop.
    if len(sys.argv) <= 1:
        print("自动化评估数据分析工具")
        print()
        print("用法:")
        print(" python analyze_eval_results.py summary # 快速摘要")
        print(" python analyze_eval_results.py report # 生成完整报告")
        print(" python analyze_eval_results.py bad-cases # 导出Bad Case")
        print(" python analyze_eval_results.py layer-perf # 各层性能分析")
        print(" python analyze_eval_results.py recommendations # 优化建议")
        return

    command = sys.argv[1]

    if command == "summary":
        print_summary(analyzer)

    elif command == "report":
        # Generate the Markdown report and echo it to the console.
        print(analyzer.generate_report())

    elif command == "bad-cases":
        analyzer.export_bad_cases_csv()
        print(f"\n✅ Found {len(analyzer.identify_bad_cases())} bad cases")
        print("详见 evaluation/bad_cases.csv")

    elif command == "layer-perf":
        print("\n🎯 各层性能:")
        for layer, perf in analyzer.layer_performance().items():
            print(f"\n{layer.upper()}:")
            print(f" Average: {perf['avg']:.3f}")
            print(f" Range: [{perf['min']:.3f}, {perf['max']:.3f}]")
            print(f" Samples: {perf['count']}")

    elif command == "recommendations":
        print("\n💡 优化建议:\n")
        for i, rec in enumerate(analyzer.get_recommendations(), 1):
            print(f"{i}.\n{rec}\n")

    else:
        print(f"Unknown command: {command}")


if __name__ == "__main__":
    main()
evaluation/clean_and_export_sft_data.py ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ SFT 数据清洗与导出脚本
4
+
5
+ 功能:
6
+ 1. 从 eval_results.jsonl 读取原始评估数据
7
+ 2. 应用严格的质量过滤规则
8
+ 3. 转换为标准 SFT 训练格式
9
+ 4. 导出为可直接用于训练的数据集
10
+
11
+ Author: Dexter
12
+ Date: 2026-01-28
13
+ """
14
+
15
+ import json
16
+ import os
17
+ from datetime import datetime
18
+ from typing import Dict, List, Tuple
19
+ from pathlib import Path
20
+
21
+ from evaluation.utils import is_chatty_query, has_code_indicators
22
+
23
+
24
+ # ============================================================================
25
+ # 配置
26
+ # ============================================================================
27
+
28
class CleaningConfig:
    """Data-cleaning thresholds and output settings."""
    # Quality gates
    MIN_OVERALL_SCORE = 0.7        # minimum combined score
    MIN_FAITHFULNESS = 0.6         # minimum faithfulness
    MIN_ANSWER_RELEVANCE = 0.6     # minimum answer relevance

    # Length gates (characters)
    MIN_QUERY_LENGTH = 10          # shortest acceptable query
    MIN_ANSWER_LENGTH = 100        # shortest acceptable answer
    MIN_CONTEXT_LENGTH = 50        # shortest acceptable context
    MAX_CONTEXT_LENGTH = 4000      # longer context is truncated

    # Hard requirements
    REQUIRE_REPO_URL = True        # record must carry a repository URL
    REQUIRE_CODE_IN_CONTEXT = True # context must contain code

    # Output location
    OUTPUT_DIR = "evaluation/sft_data/cleaned"


# ============================================================================
# Cleaning logic
# ============================================================================

def validate_sample(sample: Dict, config: CleaningConfig) -> Tuple[bool, str]:
    """Check a raw evaluation record against the quality gates.

    Gates run in a fixed order and the first failure wins, so the returned
    reason always names the earliest violated rule.

    Returns:
        (is_valid, rejection_reason) — reason is "passed" on success.
    """
    # Presence gates.
    query = sample.get("query", "")
    if not query:
        return False, "missing_query"

    gen = sample.get("generation")
    if not gen:
        return False, "missing_generation"

    if config.REQUIRE_REPO_URL and not sample.get("repo_url"):
        return False, "missing_repo_url"

    # Score gates.
    overall_score = sample.get("overall_score", 0)
    if overall_score < config.MIN_OVERALL_SCORE:
        return False, f"low_score:{overall_score:.2f}"

    faithfulness = gen.get("faithfulness", 0)
    if faithfulness < config.MIN_FAITHFULNESS:
        return False, f"low_faithfulness:{faithfulness:.2f}"

    answer_relevance = gen.get("answer_relevance", 0)
    if answer_relevance < config.MIN_ANSWER_RELEVANCE:
        return False, f"low_relevance:{answer_relevance:.2f}"

    # Length gates.
    if len(query) < config.MIN_QUERY_LENGTH:
        return False, f"short_query:{len(query)}"

    answer = gen.get("generated_answer", "")
    if len(answer) < config.MIN_ANSWER_LENGTH:
        return False, f"short_answer:{len(answer)}"

    context = gen.get("retrieved_context", "")
    if len(context) < config.MIN_CONTEXT_LENGTH:
        return False, f"short_context:{len(context)}"

    # Content gates: reject small talk and code-free context.
    if is_chatty_query(query):
        return False, "chatty_query"

    if config.REQUIRE_CODE_IN_CONTEXT and not has_code_indicators(context):
        return False, "no_code_in_context"

    return True, "passed"


def transform_to_sft_format(sample: Dict, config: CleaningConfig) -> Dict:
    """Convert a validated evaluation record into an Alpaca-style SFT sample."""
    gen = sample["generation"]

    # Cap the retrieved context so one sample cannot blow the length budget.
    context = gen.get("retrieved_context", "")
    if len(context) > config.MAX_CONTEXT_LENGTH:
        context = context[:config.MAX_CONTEXT_LENGTH] + "\n... [truncated]"

    # Evaluation scores travel with the sample for downstream filtering.
    metadata = {
        "query": sample["query"],
        "repo_url": sample.get("repo_url", ""),
        "language": sample.get("language", "en"),
        "session_id": sample.get("session_id", ""),
        "timestamp": sample.get("timestamp", ""),
        "quality_tier": sample.get("data_quality_tier", ""),
        "overall_score": sample.get("overall_score", 0),
        "faithfulness": gen.get("faithfulness", 0),
        "answer_relevance": gen.get("answer_relevance", 0),
        "answer_completeness": gen.get("answer_completeness", 0),
        "code_correctness": gen.get("code_correctness", 0),
    }

    return {
        # === Core training fields ===
        "instruction": "你是一个专业的GitHub代码仓库分析助手。根据提供的代码上下文,准确回答用户关于代码实现、架构设计、功能逻辑等问题。回答时应该:1) 直接引用相关代码 2) 解释代码的工作原理 3) 如有必要,提供代码示例。",
        "input": f"[用户问题]\n{sample['query']}\n\n[代码上下文]\n{context}",
        "output": gen.get("generated_answer", ""),
        # === Metadata ===
        "metadata": metadata,
    }
145
+
146
+
147
def clean_and_export(
    input_file: str = "evaluation/sft_data/eval_results.jsonl",
    config: "CleaningConfig" = None
) -> Dict:
    """
    Clean raw evaluation records and export the survivors as SFT samples.

    Each JSONL line of ``input_file`` is validated with ``validate_sample``;
    accepted records are converted via ``transform_to_sft_format``. Accepted
    and rejected samples are written to timestamped files under
    ``config.OUTPUT_DIR``; malformed JSON lines are skipped with a warning.

    Args:
        input_file: raw evaluation results (JSONL).
        config: cleaning thresholds; defaults to ``CleaningConfig()``.

    Returns:
        Statistics dict: totals, pass/reject counts, rejection reasons and
        the gold/silver/bronze distribution of accepted samples.
    """
    config = config or CleaningConfig()

    # Ensure the output directory exists.
    output_dir = Path(config.OUTPUT_DIR)
    output_dir.mkdir(parents=True, exist_ok=True)

    stats = {
        "total_read": 0,
        "passed": 0,
        "rejected": 0,
        "rejection_reasons": {},
        "quality_distribution": {"gold": 0, "silver": 0, "bronze": 0}
    }

    # FIX: take the timestamp exactly once so the accepted and rejected files
    # of one run always share the same suffix (previously two now() calls
    # could straddle a second boundary and produce mismatched names).
    run_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = output_dir / f"sft_train_{run_stamp}.jsonl"
    rejected_file = output_dir / f"rejected_{run_stamp}.jsonl"

    print("=" * 60)
    print("🧹 SFT 数据清洗与导出")
    print("=" * 60)
    print(f"输入文件: {input_file}")
    print(f"输出目录: {output_dir}")
    print(f"质量阈值: score >= {config.MIN_OVERALL_SCORE}")
    print()

    if not os.path.exists(input_file):
        print(f"❌ 输入文件不存在: {input_file}")
        return stats

    passed_samples = []
    rejected_samples = []

    # Read and validate line by line.
    with open(input_file, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            try:
                sample = json.loads(line)
                stats["total_read"] += 1

                is_valid, reason = validate_sample(sample, config)

                if is_valid:
                    # Convert to the training format.
                    sft_sample = transform_to_sft_format(sample, config)
                    passed_samples.append(sft_sample)
                    stats["passed"] += 1

                    # Tier bookkeeping for the report below.
                    score = sample.get("overall_score", 0)
                    if score > 0.9:
                        stats["quality_distribution"]["gold"] += 1
                    elif score > 0.7:
                        stats["quality_distribution"]["silver"] += 1
                    else:
                        stats["quality_distribution"]["bronze"] += 1
                else:
                    rejected_samples.append({
                        "reason": reason,
                        "query": sample.get("query", "")[:50],
                        "score": sample.get("overall_score", 0)
                    })
                    stats["rejected"] += 1
                    stats["rejection_reasons"][reason] = stats["rejection_reasons"].get(reason, 0) + 1

            except json.JSONDecodeError as e:
                print(f" ⚠️ 第 {line_num} 行 JSON 解析错误: {e}")
                continue

    # Export accepted samples.
    if passed_samples:
        with open(output_file, 'w', encoding='utf-8') as f:
            for sample in passed_samples:
                f.write(json.dumps(sample, ensure_ascii=False) + '\n')
        print(f"✅ 已导出 {len(passed_samples)} 条高质量样本到: {output_file}")

    # Record rejected samples for later analysis.
    if rejected_samples:
        with open(rejected_file, 'w', encoding='utf-8') as f:
            for sample in rejected_samples:
                f.write(json.dumps(sample, ensure_ascii=False) + '\n')
        print(f"📝 已记录 {len(rejected_samples)} 条被拒绝样本到: {rejected_file}")

    # Console report.
    print()
    print("=" * 60)
    print("📊 统计报告")
    print("=" * 60)
    print(f"总读取: {stats['total_read']}")
    print(f"通过: {stats['passed']} ({stats['passed']/max(stats['total_read'],1)*100:.1f}%)")
    print(f"拒绝: {stats['rejected']} ({stats['rejected']/max(stats['total_read'],1)*100:.1f}%)")
    print()
    print("质量分布:")
    print(f" 🥇 Gold (>0.9): {stats['quality_distribution']['gold']}")
    print(f" 🥈 Silver (>0.7): {stats['quality_distribution']['silver']}")
    print(f" 🥉 Bronze (>0.5): {stats['quality_distribution']['bronze']}")
    print()

    if stats["rejection_reasons"]:
        print("拒绝原因分布:")
        for reason, count in sorted(stats["rejection_reasons"].items(), key=lambda x: -x[1]):
            print(f" - {reason}: {count}")

    print()
    print("=" * 60)

    return stats
266
+
267
+
268
def export_for_training(
    input_file: str,
    output_file: str,
    format_type: str = "alpaca"
) -> int:
    """
    Re-emit cleaned SFT samples in a specific training format.

    Args:
        input_file: cleaned JSONL file produced by clean_and_export.
        output_file: destination; a ``.json`` suffix selects a pretty-printed
            JSON array, anything else gets JSONL.
        format_type: "alpaca", "sharegpt" or "messages"; any other value
            passes records through unchanged.

    Returns:
        Number of exported samples.
    """

    def as_alpaca(record):
        # Alpaca triple (LLaMA-Factory et al.).
        return {
            "instruction": record["instruction"],
            "input": record["input"],
            "output": record["output"]
        }

    def as_sharegpt(record):
        # ShareGPT conversation list.
        return {
            "conversations": [
                {"from": "system", "value": record["instruction"]},
                {"from": "human", "value": record["input"]},
                {"from": "gpt", "value": record["output"]}
            ]
        }

    def as_messages(record):
        # OpenAI chat-completions messages.
        return {
            "messages": [
                {"role": "system", "content": record["instruction"]},
                {"role": "user", "content": record["input"]},
                {"role": "assistant", "content": record["output"]}
            ]
        }

    converters = {"alpaca": as_alpaca, "sharegpt": as_sharegpt, "messages": as_messages}
    convert = converters.get(format_type, lambda record: record)

    with open(input_file, 'r', encoding='utf-8') as f:
        samples = [convert(json.loads(line)) for line in f]

    # Write as a JSON array or as JSONL depending on the file suffix.
    with open(output_file, 'w', encoding='utf-8') as f:
        if output_file.endswith('.json'):
            json.dump(samples, f, ensure_ascii=False, indent=2)
        else:
            for sample in samples:
                f.write(json.dumps(sample, ensure_ascii=False) + '\n')

    print(f"✅ 已导出 {len(samples)} 条样本为 {format_type} 格式: {output_file}")
    return len(samples)
333
+
334
+
335
+ # ============================================================================
336
+ # 主函数
337
+ # ============================================================================
338
+
339
if __name__ == "__main__":
    import argparse

    # CLI for the cleaning pipeline: choose input file, score gate, export
    # format, and whether to additionally emit a training-format file.
    parser = argparse.ArgumentParser(description="SFT 数据清洗与导出工具")
    parser.add_argument("--input", "-i", default="evaluation/sft_data/eval_results.jsonl",
                        help="输入文件路径")
    parser.add_argument("--min-score", "-s", type=float, default=0.7,
                        help="最低质量分数 (默认: 0.7)")
    parser.add_argument("--format", "-f", choices=["alpaca", "sharegpt", "messages"],
                        default="alpaca", help="导出格式 (默认: alpaca)")
    parser.add_argument("--export", "-e", action="store_true",
                        help="同时导出为训练格式")

    args = parser.parse_args()

    # Build the cleaning config, overriding the score gate from the CLI.
    config = CleaningConfig()
    config.MIN_OVERALL_SCORE = args.min_score

    # Run the clean-and-export pipeline.
    stats = clean_and_export(args.input, config)

    # Optionally re-emit the freshest cleaned file in the chosen training format.
    if args.export and stats["passed"] > 0:
        # Pick the newest sft_train_*.jsonl produced by clean_and_export.
        output_dir = Path(config.OUTPUT_DIR)
        cleaned_files = sorted(output_dir.glob("sft_train_*.jsonl"), reverse=True)
        if cleaned_files:
            latest_file = cleaned_files[0]
            export_file = output_dir / f"train_{args.format}.jsonl"
            export_for_training(str(latest_file), str(export_file), args.format)
evaluation/data_router.py ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 文件路径: evaluation/data_router.py
2
+ """
3
+ 数据路由引擎 - 负责 SFT 数据管理和路由
4
+
5
+ 根据评估结果将样本路由到不同的数据集
6
+ """
7
+
8
+ import json
9
+ import os
10
+ from typing import Dict, List, Any
11
+
12
+ from evaluation.models import EvaluationResult, DataQualityTier
13
+ from evaluation.utils import smart_truncate, SFTLengthConfig
14
+
15
+
16
class DataRoutingEngine:
    """Evaluation-driven router that files samples into SFT datasets.

    Every evaluated sample is appended to an audit log
    (``eval_results.jsonl``); samples are then routed by overall score into
    positive / negative training files.

    NOTE: project types (EvaluationResult, DataQualityTier) are referenced
    via forward-reference annotations so the class is import-order safe.
    """

    # System prompt attached to every exported SFT sample (runtime string —
    # must stay in sync with the training pipeline).
    SFT_INSTRUCTION = (
        "你是一个专业的GitHub代码仓库分析助手。根据提供的代码上下文,"
        "准确回答用户关于代码实现、架构设计、功能逻辑等问题。"
        "回答时应该:1) 直接引用相关代码 2) 解释代码的工作原理 3) 如有必要,提供代码示例。"
    )

    def __init__(self, output_dir: str = "evaluation/sft_data"):
        """Create the output directory if needed and derive all file paths."""
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

        self.positive_samples_file = os.path.join(output_dir, "positive_samples.jsonl")
        self.negative_samples_file = os.path.join(output_dir, "negative_samples.jsonl")
        self.dpo_pairs_file = os.path.join(output_dir, "dpo_pairs.jsonl")
        self.eval_results_file = os.path.join(output_dir, "eval_results.jsonl")

    def route_sample(self, eval_result: "EvaluationResult") -> str:
        """Route one sample and return its data-quality tier value."""
        # Compute the aggregate score lazily if the caller has not done so.
        if eval_result.overall_score == 0.0:
            eval_result.compute_overall_score()

        self.route_data(eval_result)
        return eval_result.data_quality_tier.value

    def route_data(self, eval_result: "EvaluationResult") -> None:
        """
        Route a sample according to its evaluation score.

        Routing rules:
        - score > 0.9 → Gold   → positive_samples.jsonl
        - score > 0.6 → Silver → positive_samples.jsonl
        - score > 0.4 → Bronze → negative_samples.jsonl
        - score <= 0.4 → Rejected (should not reach here; filtered in auto_eval)

        eval_results.jsonl records every validated sample for analysis/audit.
        """
        # Full audit trail, regardless of routing outcome.
        self._append_jsonl(self.eval_results_file, eval_result.to_dict())

        score = eval_result.overall_score
        if score > 0.6:
            # Gold (> 0.9) and Silver (> 0.6) both become positive samples.
            self._route_to_file(self.positive_samples_file, eval_result, negative=False)
        elif score > 0.4:
            # Bronze: negative sample, usable for DPO or manual correction.
            self._route_to_file(self.negative_samples_file, eval_result, negative=True)
        # <= 0.4: nothing written (already rejected upstream in auto_eval).

    def _route_to_file(self, filepath: str, eval_result: "EvaluationResult", negative: bool) -> None:
        """Build an SFT sample and append it, dropping unbuildable ones.

        FIX: a result without generation metrics used to be serialized as an
        empty ``{}`` line, polluting the SFT files; such samples are skipped.
        """
        sft_sample = self._build_sft_sample(eval_result, negative=negative)
        if sft_sample:
            self._append_jsonl(filepath, sft_sample)

    def _build_sft_sample(self, eval_result: "EvaluationResult", negative: bool = False) -> Dict:
        """
        Build one SFT training sample from an evaluation result.

        Length budget (from SFTLengthConfig):
        - context: MAX_CONTEXT_CHARS (~800 tokens)
        - answer:  MAX_ANSWER_CHARS (~1000 tokens)
        - total:   MAX_TOTAL_CHARS (~2000 tokens, sized for 4096 max_length)

        Returns an empty dict when no generation metrics are available.
        """
        if eval_result.generation_metrics is None:
            return {}

        cfg = SFTLengthConfig

        # 1. Truncate the query.
        query = eval_result.query
        if len(query) > cfg.MAX_QUERY_CHARS:
            query = query[:cfg.MAX_QUERY_CHARS] + "..."

        # 2. Smart-truncate the context (keep 70% head + 30% tail).
        context = eval_result.generation_metrics.retrieved_context
        context = smart_truncate(context, cfg.MAX_CONTEXT_CHARS, keep_ratio=0.7)

        # 3. Truncate the answer (conclusions usually sit at the start).
        answer = eval_result.generation_metrics.generated_answer
        if len(answer) > cfg.MAX_ANSWER_CHARS:
            answer = answer[:cfg.MAX_ANSWER_CHARS] + "\n\n... [回答过长,已截断]"

        # 4. Assemble the input and enforce the total budget.
        input_text = f"[用户问题]\n{query}\n\n[代码上下文]\n{context}"

        # If the combined length still exceeds the cap, shrink the context.
        total_len = len(self.SFT_INSTRUCTION) + len(input_text) + len(answer)
        if total_len > cfg.MAX_TOTAL_CHARS:
            excess = total_len - cfg.MAX_TOTAL_CHARS
            new_context_len = max(500, len(context) - excess)  # keep at least 500 chars
            context = smart_truncate(
                eval_result.generation_metrics.retrieved_context,
                new_context_len,
                keep_ratio=0.7
            )
            input_text = f"[用户问题]\n{query}\n\n[代码上下文]\n{context}"

        return {
            "instruction": self.SFT_INSTRUCTION,
            "input": input_text,
            "output": answer,
            "metadata": {
                "query": eval_result.query[:200],  # truncated in metadata to save space
                "repo_url": eval_result.repo_url,
                "language": eval_result.language,
                "session_id": eval_result.session_id,
                "timestamp": eval_result.timestamp.isoformat(),
                "quality_tier": eval_result.data_quality_tier.value,
                "overall_score": eval_result.overall_score,
                "faithfulness": eval_result.generation_metrics.faithfulness,
                "answer_relevance": eval_result.generation_metrics.answer_relevance,
                "answer_completeness": eval_result.generation_metrics.answer_completeness,
                "code_correctness": eval_result.generation_metrics.code_correctness,
                "is_negative": negative,
                "sft_ready": eval_result.sft_ready,
                # Record original lengths so truncation impact can be analyzed.
                "original_context_len": len(eval_result.generation_metrics.retrieved_context),
                "original_answer_len": len(eval_result.generation_metrics.generated_answer),
                "truncated": len(eval_result.generation_metrics.retrieved_context) > cfg.MAX_CONTEXT_CHARS
                or len(eval_result.generation_metrics.generated_answer) > cfg.MAX_ANSWER_CHARS,
            }
        }

    def _append_jsonl(self, filepath: str, data: Dict) -> None:
        """Append one record to a JSONL file."""
        with open(filepath, 'a', encoding='utf-8') as f:
            f.write(json.dumps(data, ensure_ascii=False) + '\n')

    def get_statistics(self) -> Dict[str, int]:
        """Line counts of the routed dataset files (0 for missing files)."""
        stats = {}
        for name, filepath in [
            ("positive", self.positive_samples_file),
            ("negative", self.negative_samples_file),
            ("dpo_pairs", self.dpo_pairs_file),
        ]:
            if os.path.exists(filepath):
                with open(filepath, 'r', encoding='utf-8') as f:
                    stats[name] = sum(1 for _ in f)
            else:
                stats[name] = 0
        return stats

    def get_distribution(self) -> Dict[str, int]:
        """Quality-tier distribution across all audited evaluation results."""
        distribution = {"gold": 0, "silver": 0, "bronze": 0, "rejected": 0, "corrected": 0}

        if not os.path.exists(self.eval_results_file):
            return distribution

        try:
            with open(self.eval_results_file, 'r', encoding='utf-8') as f:
                for line in f:
                    try:
                        result = json.loads(line)
                        tier = result.get("data_quality_tier", "bronze")
                        if tier in distribution:
                            distribution[tier] += 1
                    except json.JSONDecodeError:
                        # Skip corrupt lines; the audit log is append-only.
                        continue
        except Exception as e:
            print(f"⚠️ Error reading eval results: {e}")

        return distribution

    def get_bad_samples(self, limit: int = 10) -> List[Dict[str, Any]]:
        """Lowest-scoring audited samples (score < 0.5) for manual review."""
        bad_samples = []

        if not os.path.exists(self.eval_results_file):
            return bad_samples

        try:
            with open(self.eval_results_file, 'r', encoding='utf-8') as f:
                for line in f:
                    try:
                        result = json.loads(line)
                        if result.get("overall_score", 0) < 0.5:
                            sample = {
                                "query": result.get("query", ""),
                                "score": result.get("overall_score", 0),
                                "issue": result.get("error_message", "Low quality"),
                                "quality_tier": result.get("data_quality_tier", "rejected"),
                                "timestamp": result.get("timestamp", "")
                            }
                            if result.get("generation"):
                                gen = result["generation"]
                                sample.update({
                                    "faithfulness": gen.get("faithfulness", 0),
                                    "answer_relevance": gen.get("answer_relevance", 0),
                                    "answer_completeness": gen.get("answer_completeness", 0),
                                })
                            bad_samples.append(sample)
                            if len(bad_samples) >= limit:
                                break
                    except json.JSONDecodeError:
                        continue
        except Exception as e:
            print(f"⚠️ Error reading bad samples: {e}")

        return sorted(bad_samples, key=lambda x: x["score"])[:limit]
evaluation/evaluation_framework.py ADDED
@@ -0,0 +1,512 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 文件路径: evaluation/evaluation_framework.py
2
+ """
3
+ GitHub Agent 完整评估框架
4
+ 四层评估架构 + 数据路由引擎
5
+
6
+ Author: Dexter
7
+ Date: 2025-01-27
8
+
9
+ 注意: 数据模型已拆分到 models.py,数据路由已拆分到 data_router.py
10
+ 此文件保留核心评估引擎逻辑,并重新导出所有符号保持向后兼容
11
+ """
12
+
13
+ import json
14
+ import os
15
+ import re
16
+ from typing import List, Dict, Any
17
+ from datetime import datetime
18
+
19
+ # 重新导出所有模型(保持向后兼容)
20
+ from evaluation.models import (
21
+ EvaluationLayer,
22
+ DataQualityTier,
23
+ QueryRewriteMetrics,
24
+ RetrievalMetrics,
25
+ GenerationMetrics,
26
+ AgenticMetrics,
27
+ EvaluationResult,
28
+ )
29
+ from evaluation.data_router import DataRoutingEngine
30
+
31
+
32
+ # ============================================================================
33
+ # 评估引擎核心逻辑
34
+ # ============================================================================
35
+
36
+ class EvaluationEngine:
37
+ """评估引擎 - 负责多层面打分"""
38
+
39
+ def __init__(
40
+ self,
41
+ llm_client=None,
42
+ golden_dataset_path: str = "evaluation/golden_dataset.json",
43
+ model_name: str = None
44
+ ):
45
+ self.llm_client = llm_client
46
+ self.model_name = model_name or "gpt-4o-mini" # 默认使用轻量模型
47
+ self.golden_dataset = self._load_golden_dataset(golden_dataset_path)
48
+
49
+ def _load_golden_dataset(self, path: str) -> List[Dict]:
50
+ """加载黄金数据集"""
51
+ if not os.path.exists(path):
52
+ print(f"⚠️ Golden dataset not found at {path}")
53
+ return []
54
+
55
+ with open(path, 'r', encoding='utf-8') as f:
56
+ return json.load(f)
57
+
58
+ async def evaluate_query_rewrite(
59
+ self,
60
+ original_query: str,
61
+ rewritten_query: str,
62
+ language_detected: str
63
+ ) -> QueryRewriteMetrics:
64
+ """
65
+ 评估查询重写质量
66
+
67
+ 指标:
68
+ - keyword_coverage: 重写后的关键词是否覆盖了原Query的核心概念?
69
+ - semantic_preservation: 语义是否保留?
70
+ - diversity_score: 关键词多样性
71
+ """
72
+
73
+ # 简化版: 使用关键词匹配
74
+ original_tokens = set(original_query.lower().split())
75
+ rewritten_tokens = set(rewritten_query.lower().split())
76
+
77
+ # 关键词覆盖度: 原Query的关键词有多少在重写中保留
78
+ if original_tokens:
79
+ coverage = len(original_tokens & rewritten_tokens) / len(original_tokens)
80
+ else:
81
+ coverage = 0.0
82
+
83
+ # 多样性: 重写后的关键词数量越多、越不重复,分数越高
84
+ unique_ratio = len(rewritten_tokens) / max(len(original_tokens), 1)
85
+ diversity = min(1.0, unique_ratio)
86
+
87
+ # 语义保留度 (简化版本: 假设如果覆盖度高就认为语义保留良好)
88
+ semantic_preservation = min(1.0, coverage + 0.2) # 基础分+覆盖度加分
89
+
90
+ return QueryRewriteMetrics(
91
+ original_query=original_query,
92
+ rewritten_query=rewritten_query,
93
+ language_detected=language_detected,
94
+ keyword_coverage=coverage,
95
+ semantic_preservation=semantic_preservation,
96
+ diversity_score=diversity
97
+ )
98
+
99
+ async def evaluate_retrieval(
100
+ self,
101
+ query: str,
102
+ retrieved_files: List[str],
103
+ ground_truth_files: List[str],
104
+ top_k: int = 5,
105
+ retrieval_latency_ms: float = 0,
106
+ vector_scores: List[float] = None,
107
+ bm25_scores: List[float] = None
108
+ ) -> RetrievalMetrics:
109
+ """
110
+ 评估检索层质量
111
+
112
+ 指标:
113
+ - hit_rate: 是否找到了任何正确的文件?
114
+ - recall_at_k: 前K个中有多少是正确的?
115
+ - precision_at_k: 返回的文件中有多少是正确的?
116
+ - mrr: 第一个正确结果的排名倒数
117
+ """
118
+
119
+ retrieved_set = set(retrieved_files[:top_k])
120
+ ground_truth_set = set(ground_truth_files)
121
+
122
+ # Hit rate: 是否有交集
123
+ hit_rate = 1.0 if retrieved_set & ground_truth_set else 0.0
124
+
125
+ # Recall@K: 找到的正确结果数 / 正确结果总数
126
+ correct_count = len(retrieved_set & ground_truth_set)
127
+ recall = correct_count / len(ground_truth_set) if ground_truth_set else 0.0
128
+
129
+ # Precision@K: 找到的正确结果数 / 返回的结果总数
130
+ precision = correct_count / len(retrieved_set) if retrieved_set else 0.0
131
+
132
+ # MRR: 第一个正确结果的倒数排名
133
+ mrr = 0.0
134
+ for i, file in enumerate(retrieved_files[:top_k], 1):
135
+ if file in ground_truth_set:
136
+ mrr = 1.0 / i
137
+ break
138
+
139
+ # Context Relevance: 简化版 - 假设Precision反映了相关性
140
+ context_relevance = precision
141
+
142
+ # Chunk Integrity: 简化版 - 假设没有太多文件就认为完���度高
143
+ chunk_integrity = min(1.0, 1.0 / len(retrieved_set)) if retrieved_set else 0.0
144
+
145
+ vector_avg = sum(vector_scores) / len(vector_scores) if vector_scores else 0.0
146
+ bm25_avg = sum(bm25_scores) / len(bm25_scores) if bm25_scores else 0.0
147
+
148
+ return RetrievalMetrics(
149
+ query=query,
150
+ top_k=top_k,
151
+ hit_rate=hit_rate,
152
+ recall_at_k=recall,
153
+ precision_at_k=precision,
154
+ mrr=mrr,
155
+ context_relevance=context_relevance,
156
+ chunk_integrity=chunk_integrity,
157
+ retrieval_latency_ms=retrieval_latency_ms,
158
+ vector_score_avg=vector_avg,
159
+ bm25_score_avg=bm25_avg,
160
+ retrieved_files=retrieved_files,
161
+ ground_truth_files=ground_truth_files
162
+ )
163
+
164
+ async def evaluate_generation(
165
+ self,
166
+ query: str,
167
+ retrieved_context: str,
168
+ generated_answer: str,
169
+ ground_truth_answer: str = "",
170
+ generation_latency_ms: float = 0,
171
+ token_usage: Dict[str, int] = None
172
+ ) -> GenerationMetrics:
173
+ """
174
+ 评估生成层质量
175
+
176
+ 指标:
177
+ - faithfulness: 回答是否严格基于Context?
178
+ - answer_relevance: 回答是否回答了问题?
179
+ - answer_completeness: 回答是否足够完整?
180
+ - code_correctness: 生成的代码是否正确?
181
+ """
182
+
183
+ # 1. Faithfulness: 使用LLM-as-Judge进行幻觉检测
184
+ faithfulness = await self._judge_faithfulness(
185
+ retrieved_context,
186
+ generated_answer
187
+ )
188
+
189
+ # 2. Answer Relevance: 回答和问题的相似度
190
+ answer_relevance = await self._judge_answer_relevance(
191
+ query,
192
+ generated_answer
193
+ )
194
+
195
+ # 3. Answer Completeness: 简化版 - 通过长度和结构判断
196
+ completeness = self._judge_completeness(
197
+ generated_answer,
198
+ ground_truth_answer
199
+ )
200
+
201
+ # 4. Code Correctness: 使用AST检查代码块
202
+ code_samples = self._extract_code_blocks(generated_answer)
203
+ code_correctness = self._check_code_correctness(code_samples)
204
+
205
+ metrics = GenerationMetrics(
206
+ query=query,
207
+ retrieved_context=retrieved_context,
208
+ generated_answer=generated_answer,
209
+ ground_truth_answer=ground_truth_answer,
210
+ faithfulness=faithfulness,
211
+ answer_relevance=answer_relevance,
212
+ answer_completeness=completeness,
213
+ code_correctness=code_correctness,
214
+ generated_code_samples=code_samples,
215
+ generation_latency_ms=generation_latency_ms,
216
+ token_usage=token_usage or {"input": 0, "output": 0}
217
+ )
218
+
219
+ return metrics
220
+
221
+ async def _judge_faithfulness(self, context: str, answer: str) -> float:
222
+ """
223
+ LLM-as-Judge: 判断回答是否由Context支撑
224
+ 返回 0-1 的分数
225
+
226
+ 注意:Faithfulness 判断的是"回答中的信息是否能从 Context 中找到依据"
227
+ 而不是"回答是否完全复制 Context 内容"
228
+ """
229
+ if not self.llm_client:
230
+ # 简化版: 如果没有LLM客户端,使用启发式方法
231
+ # 统计Answer中的关键词有多少出现在Context中
232
+ context_lower = context.lower()
233
+ answer_words = set(answer.lower().split())
234
+ # 过滤掉常见停用词
235
+ stop_words = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
236
+ 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
237
+ 'would', 'could', 'should', 'may', 'might', 'must', 'shall',
238
+ 'can', 'need', 'dare', 'ought', 'used', 'to', 'of', 'in',
239
+ 'for', 'on', 'with', 'at', 'by', 'from', 'as', 'into', 'that',
240
+ 'which', 'who', 'whom', 'this', 'these', 'those', 'it', 'its'}
241
+ meaningful_words = answer_words - stop_words
242
+ if not meaningful_words:
243
+ return 0.7 # 没有有意义的词,给默认分
244
+ # 计算答案中有多少有意义的词出现在Context中
245
+ found_count = sum(1 for word in meaningful_words if word in context_lower)
246
+ overlap = found_count / len(meaningful_words)
247
+ return min(1.0, overlap + 0.2) # 给一定的基础分
248
+
249
+ # 智能截取 Context:提取与 Answer 相关的部分
250
+ # 如果 Context 太长,优先包含 Answer 中提到的关键词附近的内容
251
+ max_context_len = 6000 # 增加到 6000 字符
252
+ if len(context) > max_context_len:
253
+ # 尝试找到 Answer 中提到的关键文件/函数名
254
+ import re
255
+ # 提取 Answer 中可能的文件路径或函数名
256
+ patterns = re.findall(r'[a-zA-Z_][a-zA-Z0-9_]*(?:\.[a-zA-Z_][a-zA-Z0-9_]*)*', answer[:500])
257
+ important_terms = [p for p in patterns if len(p) > 3][:5] # 取前5个重要词
258
+
259
+ # 优先截取包含这些词的部分
260
+ context_parts = []
261
+ remaining = max_context_len
262
+ for term in important_terms:
263
+ idx = context.find(term)
264
+ if idx != -1 and remaining > 0:
265
+ start = max(0, idx - 300)
266
+ end = min(len(context), idx + 700)
267
+ snippet = context[start:end]
268
+ if snippet not in ''.join(context_parts):
269
+ context_parts.append(snippet)
270
+ remaining -= len(snippet)
271
+
272
+ # 如果没找到相关部分,还是用前 6000 字符
273
+ if context_parts:
274
+ truncated_context = "\n...\n".join(context_parts)
275
+ else:
276
+ truncated_context = context[:max_context_len]
277
+ else:
278
+ truncated_context = context
279
+
280
+ # 改进的 Prompt:更明确定义 Faithfulness
281
+ prompt = f"""Evaluate the FAITHFULNESS of the answer to the given context.
282
+
283
+ FAITHFULNESS means: The claims and information in the answer can be verified from or are consistent with the context.
284
+ - Score HIGH (0.7-1.0) if the answer correctly identifies or explains concepts that ARE in the context
285
+ - Score MEDIUM (0.4-0.7) if the answer is partially supported but makes some unsupported claims
286
+ - Score LOW (0.0-0.4) if the answer contradicts the context or makes completely unsupported claims
287
+
288
+ NOTE: If the answer says "X is not in the context" and X is indeed not shown, that's a FAITHFUL statement (score 0.7+)
289
+ NOTE: If the answer correctly identifies WHERE something is defined based on imports/references in context, that's FAITHFUL
290
+
291
+ [Context]
292
+ {truncated_context}
293
+
294
+ [Answer]
295
+ {answer[:1500]}
296
+
297
+ SCORE (0.0-1.0):"""
298
+
299
+ try:
300
+ response = await self.llm_client.chat.completions.create(
301
+ model=self.model_name,
302
+ messages=[{"role": "user", "content": prompt}],
303
+ temperature=0.1,
304
+ max_tokens=10
305
+ )
306
+ score_str = response.choices[0].message.content.strip()
307
+ # 提取数字(处理可能的额外文本)
308
+ import re
309
+ match = re.search(r'(\d+\.?\d*)', score_str)
310
+ if match:
311
+ score = float(match.group(1))
312
+ else:
313
+ score = float(score_str)
314
+ return min(1.0, max(0.0, score))
315
+ except Exception as e:
316
+ print(f"⚠️ Faithfulness judgment failed: {e}")
317
+ return 0.5
318
+
319
+ async def _judge_answer_relevance(self, query: str, answer: str) -> float:
320
+ """判断回答与问题的相关性"""
321
+ if not self.llm_client:
322
+ # 简化版: 使用关键词重叠度
323
+ query_words = set(query.lower().split())
324
+ answer_words = set(answer.lower().split())
325
+ overlap = len(query_words & answer_words) / max(len(query_words), 1)
326
+ return min(1.0, overlap + 0.3) # 基础分0.3+重叠度
327
+
328
+ prompt = f"""
329
+ Does the answer address the query?
330
+
331
+ [Query]
332
+ {query}
333
+
334
+ [Answer]
335
+ {answer[:1000]}
336
+
337
+ Score (0.0-1.0):
338
+ """
339
+
340
+ try:
341
+ response = await self.llm_client.chat.completions.create(
342
+ model=self.model_name,
343
+ messages=[{"role": "user", "content": prompt}],
344
+ temperature=0.1,
345
+ max_tokens=10
346
+ )
347
+ score = float(response.choices[0].message.content.strip())
348
+ return min(1.0, max(0.0, score))
349
+ except:
350
+ return 0.5
351
+
352
+ def _judge_completeness(self, generated_answer: str, ground_truth: str = "") -> float:
353
+ """判断回答的完整性"""
354
+ # 简化版: 根据长度和结构
355
+ if len(generated_answer) < 50:
356
+ return 0.3
357
+ elif len(generated_answer) < 200:
358
+ return 0.6
359
+ else:
360
+ return 0.9
361
+
362
+ def _extract_code_blocks(self, text: str) -> List[str]:
363
+ """从文本中提取代码块"""
364
+ import re
365
+ code_pattern = r'```[\w]*\n(.*?)\n```'
366
+ matches = re.findall(code_pattern, text, re.DOTALL)
367
+ return matches
368
+
369
+ def _check_code_correctness(self, code_samples: List[str]) -> float:
370
+ """检查代码是否有语法错误"""
371
+ if not code_samples:
372
+ return 1.0 # 没有代码就认为正确
373
+
374
+ import ast
375
+ correct_count = 0
376
+ for code in code_samples:
377
+ try:
378
+ ast.parse(code)
379
+ correct_count += 1
380
+ except SyntaxError:
381
+ pass
382
+
383
+ return correct_count / len(code_samples)
384
+
385
+ async def evaluate_agentic(
386
+ self,
387
+ query: str,
388
+ tool_calls: List[Dict[str, Any]],
389
+ success: bool,
390
+ steps_taken: int = 0,
391
+ end_to_end_latency_ms: float = 0
392
+ ) -> AgenticMetrics:
393
+ """
394
+ 评估Agent的决策和行为
395
+ """
396
+
397
+ # Tool Selection Accuracy: 工具选择是否正确?
398
+ tool_selection_accuracy = 1.0 if success else 0.5
399
+
400
+ # Tool Parameter Correctness: 参数是否正确传递?
401
+ tool_param_correctness = 1.0 if all(
402
+ tc.get("success", False) for tc in tool_calls
403
+ ) else 0.5
404
+
405
+ # 计算冗余步骤
406
+ unnecessary_steps = 0
407
+ backtrack_count = 0
408
+
409
+ # 简化版: 如果有重复的工具调用则视为冗余
410
+ tool_call_signatures = [tc.get("name", "") for tc in tool_calls]
411
+ for i, sig in enumerate(tool_call_signatures):
412
+ if i > 0 and sig == tool_call_signatures[i-1]:
413
+ unnecessary_steps += 1
414
+
415
+ return AgenticMetrics(
416
+ query=query,
417
+ tool_calls=tool_calls,
418
+ tool_selection_accuracy=tool_selection_accuracy,
419
+ tool_parameter_correctness=tool_param_correctness,
420
+ steps_taken=steps_taken,
421
+ unnecessary_steps=unnecessary_steps,
422
+ backtrack_count=backtrack_count,
423
+ success=success,
424
+ end_to_end_latency_ms=end_to_end_latency_ms
425
+ )
426
+
427
+ def get_statistics(self) -> Dict[str, Any]:
428
+ """
429
+ 获取评估统计信息
430
+
431
+ Returns:
432
+ 包含 total_evaluations, average_score, quality_distribution, top_issues 的字典
433
+ """
434
+ # 从 eval_results.jsonl 读取评估结果
435
+ eval_results_path = "evaluation/sft_data/eval_results.jsonl"
436
+
437
+ stats = {
438
+ "total_evaluations": 0,
439
+ "average_score": 0.0,
440
+ "quality_distribution": {
441
+ "gold": 0,
442
+ "silver": 0,
443
+ "bronze": 0,
444
+ "rejected": 0
445
+ },
446
+ "top_issues": []
447
+ }
448
+
449
+ if not os.path.exists(eval_results_path):
450
+ return stats
451
+
452
+ # 读取和分析评估结果
453
+ scores = []
454
+ issues = {}
455
+
456
+ try:
457
+ with open(eval_results_path, 'r', encoding='utf-8') as f:
458
+ for line in f:
459
+ try:
460
+ result = json.loads(line)
461
+ stats["total_evaluations"] += 1
462
+
463
+ # 收集得分
464
+ score = result.get("overall_score", 0)
465
+ scores.append(score)
466
+
467
+ # 统计质量分布
468
+ tier = result.get("data_quality_tier", "bronze")
469
+ if tier in stats["quality_distribution"]:
470
+ stats["quality_distribution"][tier] += 1
471
+
472
+ # 收集常见问题 (假设记录在 notes 或 error_message 中)
473
+ note = result.get("notes", "") or result.get("error_message", "")
474
+ if note:
475
+ issues[note] = issues.get(note, 0) + 1
476
+ except json.JSONDecodeError:
477
+ continue
478
+ except Exception as e:
479
+ print(f"⚠️ Error reading eval results: {e}")
480
+
481
+ # 计算平均分
482
+ if scores:
483
+ stats["average_score"] = sum(scores) / len(scores)
484
+
485
+ # 获取前5个常见问题
486
+ if issues:
487
+ stats["top_issues"] = [
488
+ {"issue": issue, "count": count}
489
+ for issue, count in sorted(issues.items(), key=lambda x: x[1], reverse=True)[:5]
490
+ ]
491
+
492
+ return stats
493
+
494
+
495
+ # ============================================================================
496
+ # __all__ 导出列表(保持向后兼容)
497
+ # ============================================================================
498
+
499
+ __all__ = [
500
+ # 枚举
501
+ "EvaluationLayer",
502
+ "DataQualityTier",
503
+ # 数据模型
504
+ "QueryRewriteMetrics",
505
+ "RetrievalMetrics",
506
+ "GenerationMetrics",
507
+ "AgenticMetrics",
508
+ "EvaluationResult",
509
+ # 引擎
510
+ "EvaluationEngine",
511
+ "DataRoutingEngine",
512
+ ]
evaluation/golden_dataset_builder.py ADDED
@@ -0,0 +1,414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 文件路径: evaluation/golden_dataset_builder.py
2
+ """
3
+ 黄金数据集构建工具
4
+ 用于快速构建评估所需的标注数据集
5
+
6
+ 使用场景:
7
+ 1. 初始化: 为新项目快速创建 50 条测试用例
8
+ 2. 扩展: 定期添加新的问题和标注
9
+ 3. 验证: 自动验证数据集的完整性
10
+
11
+ Author: Dexter
12
+ Date: 2025-01-27
13
+ """
14
+
15
+ import json
16
+ import os
17
+ from typing import List, Dict, Optional
18
+ from dataclasses import dataclass, asdict
19
+ from datetime import datetime
20
+
21
+
22
+ @dataclass
23
+ class GoldenSample:
24
+ """黄金数据集样本"""
25
+ id: str # 唯一ID
26
+ description: str # 问题描述 (用于标注人员理解问题类型)
27
+ query: str # 用户查询
28
+ expected_files: List[str] # 标准答案: 应该返回的文件列表
29
+ expected_answer: str = "" # 标准答案: 预期回答 (可选)
30
+ difficulty: str = "medium" # 难度: easy/medium/hard
31
+ category: str = "general" # 类别: general/code_finding/architecture/workflow
32
+ language: str = "en" # 语言: en/zh
33
+ created_at: str = ""
34
+
35
+ def __post_init__(self):
36
+ if not self.created_at:
37
+ self.created_at = datetime.now().isoformat()
38
+
39
+
40
+ class GoldenDatasetBuilder:
41
+ """黄金数据集构建器"""
42
+
43
+ def __init__(self, filepath: str = "evaluation/golden_dataset.json"):
44
+ self.filepath = filepath
45
+ self.samples: List[GoldenSample] = []
46
+ self.load()
47
+
48
+ def load(self):
49
+ """加载现有数据集"""
50
+ if os.path.exists(self.filepath):
51
+ with open(self.filepath, 'r', encoding='utf-8') as f:
52
+ try:
53
+ raw_data = json.load(f)
54
+ # 兼容旧格式 (直接是字典列表)
55
+ if isinstance(raw_data, list):
56
+ self.samples = [
57
+ GoldenSample(**item) if isinstance(item, dict) and "id" in item
58
+ else GoldenSample(
59
+ id=str(len(self.samples)),
60
+ description=item.get("description", ""),
61
+ query=item.get("query", ""),
62
+ expected_files=[item.get("answer_file", "")] if item.get("answer_file") else []
63
+ )
64
+ for item in raw_data
65
+ ]
66
+ except:
67
+ self.samples = []
68
+
69
+ def save(self):
70
+ """保存数据集"""
71
+ os.makedirs(os.path.dirname(self.filepath), exist_ok=True)
72
+ data = [asdict(s) for s in self.samples]
73
+ with open(self.filepath, 'w', encoding='utf-8') as f:
74
+ json.dump(data, f, ensure_ascii=False, indent=2)
75
+
76
+ def add_sample(self, sample: GoldenSample):
77
+ """添加样本"""
78
+ sample.id = f"sample_{len(self.samples):04d}"
79
+ self.samples.append(sample)
80
+
81
+ def add_samples_batch(self, samples: List[GoldenSample]):
82
+ """批量添加样本"""
83
+ for sample in samples:
84
+ self.add_sample(sample)
85
+
86
+ def get_samples_by_category(self, category: str) -> List[GoldenSample]:
87
+ """按类别筛选"""
88
+ return [s for s in self.samples if s.category == category]
89
+
90
+ def get_samples_by_difficulty(self, difficulty: str) -> List[GoldenSample]:
91
+ """按难度筛选"""
92
+ return [s for s in self.samples if s.difficulty == difficulty]
93
+
94
+ def get_statistics(self) -> Dict:
95
+ """获取统计信息"""
96
+ stats = {
97
+ "total": len(self.samples),
98
+ "by_category": {},
99
+ "by_difficulty": {},
100
+ "by_language": {}
101
+ }
102
+
103
+ for s in self.samples:
104
+ stats["by_category"][s.category] = stats["by_category"].get(s.category, 0) + 1
105
+ stats["by_difficulty"][s.difficulty] = stats["by_difficulty"].get(s.difficulty, 0) + 1
106
+ stats["by_language"][s.language] = stats["by_language"].get(s.language, 0) + 1
107
+
108
+ return stats
109
+
110
+
111
+ # ============================================================================
112
+ # 预定义的通用问题模板
113
+ # ============================================================================
114
+
115
+ # 针对 FastAPI 项目的初始数据集 (参考你现有的 golden_dataset.json)
116
+ FASTAPI_GOLDEN_SAMPLES = [
117
+ # Easy: 代码位置查找
118
+ GoldenSample(
119
+ id="",
120
+ description="简单函数查找",
121
+ query="Where is the 'serialize_response' function?",
122
+ expected_files=["fastapi/routing.py"],
123
+ difficulty="easy",
124
+ category="code_finding"
125
+ ),
126
+
127
+ # Medium: 理解数据流
128
+ GoldenSample(
129
+ id="",
130
+ description="理解核心模块职责",
131
+ query="How does dependency injection work in FastAPI?",
132
+ expected_files=["fastapi/dependencies/utils.py", "fastapi/depends.py"],
133
+ difficulty="medium",
134
+ category="architecture"
135
+ ),
136
+
137
+ # Hard: 跨文件理解工作流
138
+ GoldenSample(
139
+ id="",
140
+ description="完整工作流理解",
141
+ query="Show me the complete flow from request to response in FastAPI",
142
+ expected_files=["fastapi/routing.py", "fastapi/applications.py", "fastapi/dependencies/utils.py"],
143
+ difficulty="hard",
144
+ category="workflow"
145
+ ),
146
+ ]
147
+
148
+ # GitHub Agent 项目的初始数据集
149
+ GITHUB_AGENT_GOLDEN_SAMPLES = [
150
+ GoldenSample(
151
+ id="",
152
+ description="检索核心逻辑",
153
+ query="How is chunk_file method implemented?",
154
+ expected_files=["app/services/chunking_service.py"],
155
+ expected_answer="The chunk_file method is implemented in chunking_service.py. It takes content and file_path as parameters and uses AST parsing for Python files to intelligently chunk the code.",
156
+ difficulty="easy",
157
+ category="code_finding",
158
+ language="en"
159
+ ),
160
+
161
+ GoldenSample(
162
+ id="",
163
+ description="向量搜索机制",
164
+ query="What vector database is used for retrieval?",
165
+ expected_files=["app/services/vector_service.py"],
166
+ difficulty="medium",
167
+ category="architecture",
168
+ language="en"
169
+ ),
170
+
171
+ GoldenSample(
172
+ id="",
173
+ description="完整分析流程",
174
+ query="How does the agent analyze a GitHub repository?",
175
+ expected_files=["app/services/agent_service.py", "app/services/chunking_service.py", "app/services/vector_service.py"],
176
+ difficulty="hard",
177
+ category="workflow",
178
+ language="en"
179
+ ),
180
+ ]
181
+
182
+
183
+ # ============================================================================
184
+ # 交互式数据集构建工具
185
+ # ============================================================================
186
+
187
+ def interactive_builder():
188
+ """交互式构建黄金数据集"""
189
+ builder = GoldenDatasetBuilder()
190
+
191
+ print("=" * 60)
192
+ print("🛠️ 黄金数据集构建工具")
193
+ print("=" * 60)
194
+
195
+ while True:
196
+ print("\n请选择操作:")
197
+ print("1. 添加新样本")
198
+ print("2. 查看现有样本")
199
+ print("3. 按类别筛选")
200
+ print("4. 统计信息")
201
+ print("5. 保存并退出")
202
+ print("0. 退出(不保存)")
203
+
204
+ choice = input("请输入选项 (0-5): ").strip()
205
+
206
+ if choice == "1":
207
+ sample = GoldenSample(
208
+ id="",
209
+ description=input("📝 描述 (问题类型): "),
210
+ query=input("❓ 查询/问题: "),
211
+ expected_files=input("📁 预期文件 (逗号分隔): ").split(","),
212
+ expected_answer=input("📄 标准答案 (可选): "),
213
+ difficulty=input("⭐ 难度 (easy/medium/hard) [medium]: ") or "medium",
214
+ category=input("🏷️ 类别 (code_finding/architecture/workflow/general) [general]: ") or "general",
215
+ language=input("🌍 语言 (en/zh) [en]: ") or "en"
216
+ )
217
+ builder.add_sample(sample)
218
+ print("✅ 样本已添加")
219
+
220
+ elif choice == "2":
221
+ print(f"\n总共 {len(builder.samples)} 个样本:")
222
+ for s in builder.samples[-10:]: # 显示最后10个
223
+ print(f" - [{s.difficulty}] {s.query[:50]}")
224
+
225
+ elif choice == "3":
226
+ category = input("输入类别: ")
227
+ samples = builder.get_samples_by_category(category)
228
+ print(f"\n找到 {len(samples)} 个 '{category}' 类别的样本:")
229
+ for s in samples:
230
+ print(f" - {s.query}")
231
+
232
+ elif choice == "4":
233
+ stats = builder.get_statistics()
234
+ print(f"\n📊 数据集统计:")
235
+ print(f" 总样本数: {stats['total']}")
236
+ print(f" 按类别: {stats['by_category']}")
237
+ print(f" 按难度: {stats['by_difficulty']}")
238
+ print(f" 按语言: {stats['by_language']}")
239
+
240
+ elif choice == "5":
241
+ builder.save()
242
+ print("✅ 数据集已保存")
243
+ break
244
+
245
+ elif choice == "0":
246
+ print("⚠️ 未保存,退出")
247
+ break
248
+
249
+
250
+ # ============================================================================
251
+ # 自动评估数据集的完整性
252
+ # ============================================================================
253
+
254
+ def validate_golden_dataset(filepath: str = "evaluation/golden_dataset.json") -> Dict:
255
+ """验证黄金数据集的完整性"""
256
+
257
+ builder = GoldenDatasetBuilder(filepath)
258
+ issues = {
259
+ "missing_fields": [],
260
+ "empty_queries": [],
261
+ "empty_files": [],
262
+ "duplicates": []
263
+ }
264
+
265
+ seen_queries = set()
266
+
267
+ for i, sample in enumerate(builder.samples):
268
+ # 检查必填字段
269
+ if not sample.query:
270
+ issues["empty_queries"].append(f"Sample {i}: query is empty")
271
+
272
+ if not sample.expected_files or all(not f for f in sample.expected_files):
273
+ issues["empty_files"].append(f"Sample {i}: expected_files is empty")
274
+
275
+ # 检查重复
276
+ if sample.query in seen_queries:
277
+ issues["duplicates"].append(f"Sample {i}: duplicate query")
278
+ seen_queries.add(sample.query)
279
+
280
+ return {
281
+ "valid": len(issues) == 0 or not any(issues.values()),
282
+ "total_samples": len(builder.samples),
283
+ "issues": issues,
284
+ "stats": builder.get_statistics()
285
+ }
286
+
287
+
288
+ # ============================================================================
289
+ # 快速初始化脚本
290
+ # ============================================================================
291
+
292
+ def init_github_agent_dataset():
293
+ """快速初始化 GitHub Agent 项目的数据集"""
294
+ builder = GoldenDatasetBuilder("evaluation/golden_dataset.json")
295
+
296
+ # 清空现有 (可选)
297
+ # builder.samples = []
298
+
299
+ # 添加初始样本
300
+ builder.add_samples_batch(GITHUB_AGENT_GOLDEN_SAMPLES)
301
+
302
+ # 额外添加更多样本 (扩展到30+)
303
+ extra_samples = [
304
+ GoldenSample(
305
+ id="",
306
+ description="向量检索质量",
307
+ query="What retrieval metrics are tracked?",
308
+ expected_files=["evaluation/evaluation_framework.py"],
309
+ difficulty="medium",
310
+ category="architecture"
311
+ ),
312
+ GoldenSample(
313
+ id="",
314
+ description="Agent决策过程",
315
+ query="How does the agent decide which files to read?",
316
+ expected_files=["app/services/agent_service.py"],
317
+ difficulty="hard",
318
+ category="workflow"
319
+ ),
320
+ GoldenSample(
321
+ id="",
322
+ description="错误处理",
323
+ query="Where are network timeout errors handled?",
324
+ expected_files=["app/services/agent_service.py", "app/services/chat_service.py"],
325
+ difficulty="medium",
326
+ category="code_finding"
327
+ ),
328
+ ]
329
+ builder.add_samples_batch(extra_samples)
330
+ builder.save()
331
+
332
+ print(f"✅ 初始化完成: {len(builder.samples)} 个样本")
333
+ print(f"📊 {builder.get_statistics()}")
334
+
335
+
336
+ # ============================================================================
337
+ # 导出为 Ragas 格式
338
+ # ============================================================================
339
+
340
+ def export_to_ragas_format(golden_filepath: str, output_filepath: str = "evaluation/ragas_eval_dataset.json"):
341
+ """
342
+ 将黄金数据集导出为 Ragas 评估框架所需的格式
343
+
344
+ Ragas 格式:
345
+ {
346
+ "questions": [...],
347
+ "contexts": [...],
348
+ "ground_truths": [...]
349
+ }
350
+ """
351
+ builder = GoldenDatasetBuilder(golden_filepath)
352
+
353
+ ragas_data = {
354
+ "questions": [],
355
+ "contexts": [],
356
+ "ground_truths": [],
357
+ "metadata": []
358
+ }
359
+
360
+ for sample in builder.samples:
361
+ ragas_data["questions"].append(sample.query)
362
+ ragas_data["ground_truths"].append({
363
+ "answer": sample.expected_answer,
364
+ "files": sample.expected_files
365
+ })
366
+ ragas_data["contexts"].append("\n".join(sample.expected_files))
367
+ ragas_data["metadata"].append({
368
+ "difficulty": sample.difficulty,
369
+ "category": sample.category,
370
+ "description": sample.description
371
+ })
372
+
373
+ os.makedirs(os.path.dirname(output_filepath), exist_ok=True)
374
+ with open(output_filepath, 'w', encoding='utf-8') as f:
375
+ json.dump(ragas_data, f, ensure_ascii=False, indent=2)
376
+
377
+ print(f"✅ Exported to {output_filepath}")
378
+ print(f" Questions: {len(ragas_data['questions'])}")
379
+
380
+
381
+ # ============================================================================
382
+ # 命令行接口
383
+ # ============================================================================
384
+
385
+ if __name__ == "__main__":
386
+ import sys
387
+
388
+ if len(sys.argv) > 1:
389
+ command = sys.argv[1]
390
+
391
+ if command == "init":
392
+ init_github_agent_dataset()
393
+
394
+ elif command == "validate":
395
+ result = validate_golden_dataset()
396
+ print(json.dumps(result, indent=2, ensure_ascii=False))
397
+
398
+ elif command == "export-ragas":
399
+ export_to_ragas_format("evaluation/golden_dataset.json")
400
+
401
+ elif command == "interactive":
402
+ interactive_builder()
403
+
404
+ else:
405
+ print(f"Unknown command: {command}")
406
+
407
+ else:
408
+ print("黄金数据集构建工具")
409
+ print()
410
+ print("用法:")
411
+ print(" python golden_dataset_builder.py init # 快速初始化")
412
+ print(" python golden_dataset_builder.py validate # 验证数据集")
413
+ print(" python golden_dataset_builder.py export-ragas # 导出为Ragas格式")
414
+ print(" python golden_dataset_builder.py interactive # 交互式构建")
evaluation/models.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 文件路径: evaluation/models.py
2
+ """
3
+ 评估数据模型定义
4
+
5
+ 将所有数据类和枚举集中管理,保持代码职责清晰
6
+ """
7
+
8
+ from dataclasses import dataclass, field, asdict
9
+ from typing import List, Dict, Optional, Any
10
+ from enum import Enum
11
+ from datetime import datetime
12
+
13
+
14
class EvaluationLayer(Enum):
    """Pipeline layers that can be evaluated independently."""
    QUERY_REWRITE = "query_rewrite"  # query reformulation before retrieval
    RETRIEVAL = "retrieval"          # document retrieval layer
    GENERATION = "generation"        # LLM answer generation layer
    AGENTIC = "agentic"              # agent tool-use behaviour layer
20
+
21
+
22
class DataQualityTier(Enum):
    """Data quality tiers (used to route samples into SFT datasets)."""
    GOLD = "gold"            # perfect samples (score > 0.9)
    SILVER = "silver"        # high-quality samples (score 0.7-0.9)
    BRONZE = "bronze"        # usable samples (score 0.5-0.7)
    REJECTED = "rejected"    # rejected (score < 0.5)
    CORRECTED = "corrected"  # self-corrected samples (used for DPO)
29
+
30
+
31
+ # ============================================================================
32
+ # 各层评估指标
33
+ # ============================================================================
34
+
35
@dataclass
class QueryRewriteMetrics:
    """Metrics for the query-rewrite stage; all scores normalised to [0, 1]."""
    original_query: str
    rewritten_query: str
    language_detected: str
    keyword_coverage: float  # fraction of source keywords preserved, 0-1
    semantic_preservation: float  # how much meaning survives the rewrite, 0-1
    diversity_score: float  # variety across rewrites, 0-1

    def overall_score(self) -> float:
        """Weighted aggregate: coverage and semantic preservation dominate."""
        weighted = (
            (self.keyword_coverage, 0.4),
            (self.semantic_preservation, 0.4),
            (self.diversity_score, 0.2),
        )
        return sum(value * weight for value, weight in weighted)
51
+
52
+
53
@dataclass
class RetrievalMetrics:
    """Metrics for the retrieval layer of the pipeline."""
    query: str
    top_k: int

    # Core metrics
    hit_rate: float
    recall_at_k: float
    precision_at_k: float
    mrr: float  # Mean Reciprocal Rank

    # Advanced metrics
    context_relevance: float
    chunk_integrity: float
    retrieval_latency_ms: float

    # Hybrid retrieval score averages
    vector_score_avg: float
    bm25_score_avg: float

    retrieved_files: List[str] = field(default_factory=list)
    ground_truth_files: List[str] = field(default_factory=list)

    def overall_score(self) -> float:
        """Weighted combination of recall, precision, relevance and integrity."""
        weighted = (
            (self.recall_at_k, 0.3),
            (self.precision_at_k, 0.3),
            (self.context_relevance, 0.25),
            (self.chunk_integrity, 0.15),
        )
        return sum(value * weight for value, weight in weighted)
84
+
85
+
86
@dataclass
class GenerationMetrics:
    """Metrics for the answer-generation layer."""
    query: str
    retrieved_context: str
    generated_answer: str

    # Core metrics, each in [0, 1]
    faithfulness: float
    answer_relevance: float
    answer_completeness: float
    code_correctness: float

    # Optional fields
    ground_truth_answer: str = ""
    hallucination_count: int = 0
    unsupported_claims: List[str] = field(default_factory=list)
    generated_code_samples: List[str] = field(default_factory=list)
    generation_latency_ms: float = 0
    token_usage: Dict[str, int] = field(default_factory=lambda: {"input": 0, "output": 0})

    def overall_score(self) -> float:
        """Weighted score minus a 0.1 penalty per detected hallucination (floored at 0)."""
        weighted = (
            (self.faithfulness, 0.35),
            (self.answer_relevance, 0.35),
            (self.answer_completeness, 0.2),
            (self.code_correctness, 0.1),
        )
        base_score = sum(value * weight for value, weight in weighted)
        penalty = self.hallucination_count * 0.1
        return max(0, base_score - penalty)
116
+
117
+
118
@dataclass
class AgenticMetrics:
    """Metrics describing the agent's tool-use behaviour for one query."""
    query: str
    tool_selection_accuracy: float
    tool_parameter_correctness: float

    tool_calls: List[Dict[str, Any]] = field(default_factory=list)
    steps_taken: int = 0
    unnecessary_steps: int = 0
    backtrack_count: int = 0
    success: bool = True
    early_termination: bool = False
    end_to_end_latency_ms: float = 0

    def efficiency_score(self) -> float:
        """1 minus the wasted-step ratio, minus 0.1 per backtrack; clamped to [0, 1]."""
        if self.steps_taken == 0:
            return 0
        redundancy_ratio = self.unnecessary_steps / self.steps_taken
        raw = 1 - redundancy_ratio - self.backtrack_count * 0.1
        return max(0, min(1, raw))

    def overall_score(self) -> float:
        """Weighted combination; overall success contributes a flat 0.1."""
        success_term = 1.0 if self.success else 0.0
        weighted = (
            (self.tool_selection_accuracy, 0.4),
            (self.tool_parameter_correctness, 0.3),
            (self.efficiency_score(), 0.2),
            (success_term, 0.1),
        )
        return sum(value * weight for value, weight in weighted)
146
+
147
+
148
+ # ============================================================================
149
+ # 综合评估结果
150
+ # ============================================================================
151
+
152
@dataclass
class EvaluationResult:
    """Complete result of one evaluation run, aggregating all layer metrics."""
    session_id: str
    query: str
    repo_url: str
    timestamp: datetime
    language: str = "en"

    # Per-layer metrics (None when that layer was not evaluated)
    query_rewrite_metrics: Optional[QueryRewriteMetrics] = None
    retrieval_metrics: Optional[RetrievalMetrics] = None
    generation_metrics: Optional[GenerationMetrics] = None
    agentic_metrics: Optional[AgenticMetrics] = None

    # Aggregate scoring
    overall_score: float = 0.0
    data_quality_tier: DataQualityTier = DataQualityTier.BRONZE

    # SFT labelling flags
    sft_ready: bool = False
    dpo_candidate: bool = False

    # Metadata
    error_message: Optional[str] = None
    notes: str = ""

    def compute_overall_score(self) -> float:
        """Weighted mean over the layers that were actually evaluated.

        Side effects: sets ``overall_score``, ``data_quality_tier`` and
        (for GOLD/SILVER) ``sft_ready``. Returns 0.0 when no layer was
        evaluated, leaving the fields untouched.
        """
        layer_weights = (
            (self.query_rewrite_metrics, 0.15),
            (self.retrieval_metrics, 0.35),
            (self.generation_metrics, 0.4),
            (self.agentic_metrics, 0.1),
        )
        contributions = [
            (metrics.overall_score(), weight)
            for metrics, weight in layer_weights
            if metrics
        ]

        if not contributions:
            return 0.0

        # Weights are renormalised over the layers that are present.
        total_weight = sum(weight for _, weight in contributions)
        self.overall_score = (
            sum(score * weight for score, weight in contributions) / total_weight
        )

        # Tier thresholds: >0.9 gold, >0.7 silver, >0.5 bronze, else rejected.
        if self.overall_score > 0.9:
            self.data_quality_tier = DataQualityTier.GOLD
            self.sft_ready = True
        elif self.overall_score > 0.7:
            self.data_quality_tier = DataQualityTier.SILVER
            self.sft_ready = True
        elif self.overall_score > 0.5:
            self.data_quality_tier = DataQualityTier.BRONZE
        else:
            self.data_quality_tier = DataQualityTier.REJECTED

        return self.overall_score

    def to_dict(self) -> Dict:
        """Serialise to a plain dict; nested metrics go through dataclasses.asdict."""
        result = {
            "session_id": self.session_id,
            "query": self.query,
            "repo_url": self.repo_url,
            "timestamp": self.timestamp.isoformat(),
            "language": self.language,
            "overall_score": self.overall_score,
            "data_quality_tier": self.data_quality_tier.value,
            "sft_ready": self.sft_ready,
            "dpo_candidate": self.dpo_candidate,
            "error_message": self.error_message,
            "notes": self.notes,
        }

        # Optional per-layer sections are included only when evaluated.
        optional_sections = (
            ("query_rewrite", self.query_rewrite_metrics),
            ("retrieval", self.retrieval_metrics),
            ("generation", self.generation_metrics),
            ("agentic", self.agentic_metrics),
        )
        for key, metrics in optional_sections:
            if metrics:
                result[key] = asdict(metrics)

        return result
evaluation/test_retrieval.py ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ 检索系统离线评估脚本
4
+
5
+ 用于测试 chunking 和检索策略的准确率。
6
+ 使用 golden_dataset.json 中的标注数据作为 ground truth。
7
+
8
+ 使用方法:
9
+ python evaluation/test_retrieval.py --repo https://github.com/tiangolo/fastapi
10
+ python evaluation/test_retrieval.py --repo https://github.com/tiangolo/fastapi --top-k 5
11
+ python evaluation/test_retrieval.py --repo https://github.com/tiangolo/fastapi --verbose
12
+
13
+ Author: Dexter
14
+ Date: 2026-01-28
15
+ """
16
+
17
+ import json
18
+ import os
19
+ import sys
20
+ import asyncio
21
+ import argparse
22
+ from typing import List, Dict, Tuple
23
+ from dataclasses import dataclass, field
24
+ from datetime import datetime
25
+
26
+ # 添加项目根目录到 path
27
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
28
+
29
+ from app.services.vector_service import store_manager
30
+ from app.services.github_service import get_repo_structure
31
+
32
+
33
@dataclass
class RetrievalTestResult:
    """Result of one retrieval test case against the golden dataset."""
    query: str
    expected_files: List[str]
    retrieved_files: List[str]
    hit: bool  # whether any expected file was retrieved
    recall: float  # recall: retrieved expected files / all expected files
    precision: float  # precision: retrieved expected files / number of results
    reciprocal_rank: float  # 1 / rank of the first hit (0.0 when nothing hit)
    difficulty: str = ""
    category: str = ""
+
46
+
47
@dataclass
class EvaluationReport:
    """Full evaluation report aggregated over all test cases."""
    repo_url: str
    top_k: int
    total_queries: int
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())

    # Aggregate metrics
    hit_rate: float = 0.0  # share of queries with at least one hit
    mean_recall: float = 0.0  # average recall over evaluated queries
    mean_precision: float = 0.0  # average precision over evaluated queries
    mrr: float = 0.0  # Mean Reciprocal Rank

    # Grouped by difficulty label
    by_difficulty: Dict[str, Dict] = field(default_factory=dict)

    # Detailed per-query results
    results: List[RetrievalTestResult] = field(default_factory=list)
    failed_cases: List[Dict] = field(default_factory=list)
67
+
68
+
69
class RetrievalEvaluator:
    """Offline evaluator for the retrieval system.

    Replays every query from the golden dataset against the session's
    vector store and aggregates hit rate, recall, precision and MRR.
    """

    def __init__(self, golden_dataset_path: str = "evaluation/golden_dataset.json"):
        # Fails fast: loading raises FileNotFoundError when the dataset is missing.
        self.golden_dataset = self._load_golden_dataset(golden_dataset_path)
        print(f"📊 Loaded {len(self.golden_dataset)} test cases from golden dataset")

    def _load_golden_dataset(self, path: str) -> List[Dict]:
        """Load the golden dataset (a JSON list of annotated test cases)."""
        if not os.path.exists(path):
            raise FileNotFoundError(f"Golden dataset not found: {path}")

        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    async def evaluate(
        self,
        repo_url: str,
        session_id: str = "eval_test",
        top_k: int = 5,
        verbose: bool = False
    ) -> EvaluationReport:
        """
        Run the full retrieval evaluation.

        Args:
            repo_url: Repository URL to evaluate.
            session_id: Session ID used to look up the vector store.
            top_k: Number of files returned per retrieval.
            verbose: Print per-query detail instead of a progress counter.

        Returns:
            EvaluationReport with aggregate metrics, or None when the
            vector store has not been indexed yet.
        """
        print(f"\n{'='*60}")
        print(f"🔍 Retrieval Evaluation")
        print(f"{'='*60}")
        print(f"Repository: {repo_url}")
        print(f"Top-K: {top_k}")
        print(f"Test Cases: {len(self.golden_dataset)}")
        print(f"{'='*60}\n")

        # Fetch the repository file list.
        print("📂 Fetching repository structure...")
        file_list = get_repo_structure(repo_url)  # synchronous helper — no await needed
        print(f" Found {len(file_list)} files")

        # Get the vector store for this session.
        store = store_manager.get_store(session_id)
        chunk_count = store.collection.count()  # chunk count via collection.count()
        if chunk_count == 0:
            print("\n⚠️ Vector store is empty!")
            print(" Please run the agent first to index the repository.")
            print(" Example: Access http://localhost:8000 and analyze the repo first.")
            return None
        print(f" Vector store has {chunk_count} chunks")

        # Run the evaluation loop.
        report = EvaluationReport(
            repo_url=repo_url,
            top_k=top_k,
            total_queries=len(self.golden_dataset)
        )

        hits = 0
        recalls = []
        precisions = []
        reciprocal_ranks = []

        difficulty_stats = {}

        for i, sample in enumerate(self.golden_dataset):
            query = sample.get("query", "")
            expected_files = sample.get("expected_files", [])
            difficulty = sample.get("difficulty", "medium")
            category = sample.get("category", "general")

            # Skip samples with no query or no ground truth.
            if not query or not expected_files:
                continue

            # Run the retrieval (hybrid search).
            try:
                results = await store.search_hybrid(query, top_k=top_k)
            except Exception as e:
                if verbose:
                    print(f" [ERR] Search failed: {e}")
                continue

            # Extract retrieved file paths (deduplicated, order preserved).
            retrieved_files = []
            for doc in results:
                if isinstance(doc, dict):
                    file_path = doc.get("file", "")
                    if file_path and file_path not in retrieved_files:
                        retrieved_files.append(file_path)

            # Compute per-query metrics.
            expected_set = set(expected_files)
            retrieved_set = set(retrieved_files[:top_k])

            # Files that were both expected and retrieved.
            hits_set = expected_set & retrieved_set

            # Hit: at least one expected file retrieved.
            hit = len(hits_set) > 0
            if hit:
                hits += 1

            # Recall: retrieved expected / all expected.
            recall = len(hits_set) / len(expected_set) if expected_set else 0
            recalls.append(recall)

            # Precision: retrieved expected / number of results.
            precision = len(hits_set) / min(len(retrieved_files), top_k) if retrieved_files else 0
            precisions.append(precision)

            # Reciprocal Rank: 1 / position of the first hit.
            rr = 0.0
            for rank, file in enumerate(retrieved_files[:top_k], 1):
                if file in expected_set:
                    rr = 1.0 / rank
                    break
            reciprocal_ranks.append(rr)

            # Record the per-query result.
            result = RetrievalTestResult(
                query=query,
                expected_files=expected_files,
                retrieved_files=retrieved_files[:top_k],
                hit=hit,
                recall=recall,
                precision=precision,
                reciprocal_rank=rr,
                difficulty=difficulty,
                category=category
            )
            report.results.append(result)

            # Per-difficulty statistics.
            if difficulty not in difficulty_stats:
                difficulty_stats[difficulty] = {"hits": 0, "total": 0, "recalls": [], "precisions": []}
            difficulty_stats[difficulty]["total"] += 1
            if hit:
                difficulty_stats[difficulty]["hits"] += 1
            difficulty_stats[difficulty]["recalls"].append(recall)
            difficulty_stats[difficulty]["precisions"].append(precision)

            # Keep failed cases for the report.
            if not hit:
                report.failed_cases.append({
                    "query": query,
                    "expected": expected_files,
                    "retrieved": retrieved_files[:top_k],
                    "difficulty": difficulty
                })

            # Progress output.
            if verbose:
                status = "✅" if hit else "❌"
                print(f" [{i+1:3d}] {status} Recall={recall:.2f} | {query[:50]}...")
            else:
                print(f"\r Progress: {i+1}/{len(self.golden_dataset)}", end="")

        print("\n")

        # Aggregate metrics.
        # NOTE(review): hit_rate divides by the full dataset size, so samples
        # skipped above (empty query / search error) count as misses, while
        # mean_recall / mean_precision / MRR average only over evaluated
        # samples — confirm this asymmetry is intended.
        report.hit_rate = hits / len(self.golden_dataset) if self.golden_dataset else 0
        report.mean_recall = sum(recalls) / len(recalls) if recalls else 0
        report.mean_precision = sum(precisions) / len(precisions) if precisions else 0
        report.mrr = sum(reciprocal_ranks) / len(reciprocal_ranks) if reciprocal_ranks else 0

        # Roll up the per-difficulty statistics.
        for diff, stats in difficulty_stats.items():
            report.by_difficulty[diff] = {
                "hit_rate": stats["hits"] / stats["total"] if stats["total"] else 0,
                "mean_recall": sum(stats["recalls"]) / len(stats["recalls"]) if stats["recalls"] else 0,
                "mean_precision": sum(stats["precisions"]) / len(stats["precisions"]) if stats["precisions"] else 0,
                "total": stats["total"]
            }

        return report

    def print_report(self, report: EvaluationReport):
        """Pretty-print the evaluation report to stdout."""
        print(f"\n{'='*60}")
        print(f"📊 RETRIEVAL EVALUATION REPORT")
        print(f"{'='*60}")
        print(f"Repository: {report.repo_url}")
        print(f"Top-K: {report.top_k}")
        print(f"Total Queries: {report.total_queries}")
        print(f"Timestamp: {report.timestamp}")
        print(f"{'='*60}\n")

        print("📈 OVERALL METRICS")
        print(f" Hit Rate: {report.hit_rate:.1%}")
        print(f" Mean Recall: {report.mean_recall:.1%}")
        print(f" Mean Precision: {report.mean_precision:.1%}")
        print(f" MRR: {report.mrr:.3f}")

        print(f"\n📊 BY DIFFICULTY")
        for diff, stats in sorted(report.by_difficulty.items()):
            print(f" {diff.upper():8s} | Hit: {stats['hit_rate']:.1%} | Recall: {stats['mean_recall']:.1%} | n={stats['total']}")

        if report.failed_cases:
            print(f"\n❌ FAILED CASES ({len(report.failed_cases)} total)")
            for case in report.failed_cases[:5]:  # show only the first 5
                print(f" Query: {case['query'][:60]}...")
                print(f" Expected: {case['expected']}")
                print(f" Got: {case['retrieved'][:3]}...")
                print()

        print(f"{'='*60}")

    def save_report(self, report: EvaluationReport, output_path: str = "evaluation/retrieval_report.json"):
        """Persist the report as JSON under *output_path* (directories created)."""
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # Convert to a JSON-serialisable structure.
        data = {
            "repo_url": report.repo_url,
            "top_k": report.top_k,
            "total_queries": report.total_queries,
            "timestamp": report.timestamp,
            "metrics": {
                "hit_rate": report.hit_rate,
                "mean_recall": report.mean_recall,
                "mean_precision": report.mean_precision,
                "mrr": report.mrr
            },
            "by_difficulty": report.by_difficulty,
            "failed_cases": report.failed_cases
        }

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        print(f"\n💾 Report saved to: {output_path}")
303
+
304
+
305
async def main():
    """Command-line entry point: evaluate one repository against the golden dataset."""
    cli = argparse.ArgumentParser(description="Evaluate retrieval system using golden dataset")
    cli.add_argument("--repo", required=True, help="GitHub repository URL to evaluate")
    cli.add_argument("--top-k", type=int, default=5, help="Number of results to retrieve (default: 5)")
    cli.add_argument("--session", default="eval_test", help="Session ID for vector store")
    cli.add_argument("--verbose", "-v", action="store_true", help="Print detailed results")
    cli.add_argument("--save", action="store_true", help="Save report to file")

    options = cli.parse_args()

    evaluator = RetrievalEvaluator()
    report = await evaluator.evaluate(
        repo_url=options.repo,
        session_id=options.session,
        top_k=options.top_k,
        verbose=options.verbose,
    )

    # evaluate() returns None when the vector store was empty.
    if not report:
        return
    evaluator.print_report(report)
    if options.save:
        evaluator.save_report(report)


if __name__ == "__main__":
    asyncio.run(main())
evaluation/utils.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 文件路径: evaluation/utils.py
2
+ """
3
+ 评估模块公共工具函数和常量
4
+
5
+ 将重复的逻辑抽取到这里,保持代码 DRY (Don't Repeat Yourself)
6
+ """
7
+
8
+ from typing import List
9
+
10
+
11
# ============================================================================
# Small-talk / invalid query detection
# ============================================================================

CHATTY_PATTERNS: List[str] = [
    # Chinese small talk
    "你好", "您好", "嗨", "在吗", "在不在", "谢谢", "多谢", "再见", "拜拜",
    "什么是", "你是谁", "你叫什么", "帮帮我", "教教我",
    # English small talk
    "hello", "hi", "hey", "thanks", "thank you", "bye", "goodbye",
    "what is", "who are you", "help me", "can you",
    # Single words / very short inputs
    "test", "测试", "ok", "yes", "no",
]

# Markers that indicate the presence of source code
CODE_INDICATORS: List[str] = [
    # Python
    "def ", "class ", "import ", "from ",
    # JavaScript/TypeScript
    "function ", "const ", "let ", "var ",
    # Java/C#
    "public ", "private ", "void ",
    # Go
    "func ", "package ",
    # Generic
    "```",  # Markdown code fence
]
39
+
40
+
41
def is_chatty_query(query: str, min_length: int = 5) -> bool:
    """
    Return True when *query* is small talk or too short to be a real question.

    Args:
        query: Raw user query.
        min_length: Queries shorter than this are considered invalid.

    Returns:
        True for small-talk / invalid queries.
    """
    if not query:
        return True

    normalized = query.lower().strip()

    # Too short to be a meaningful question.
    if len(normalized) < min_length:
        return True

    # Either an exact small-talk phrase, or one used as the leading word.
    return any(
        normalized == pattern or normalized.startswith(pattern + " ")
        for pattern in CHATTY_PATTERNS
    )
67
+
68
+
69
def has_code_indicators(text: str) -> bool:
    """
    Return True when *text* contains any known code-language marker.

    Args:
        text: Text to inspect.

    Returns:
        True when at least one entry of CODE_INDICATORS occurs in the text.
    """
    if not text:
        return False
    return any(marker in text for marker in CODE_INDICATORS)
87
+
88
+
89
+ # ============================================================================
90
+ # 文件操作工具
91
+ # ============================================================================
92
+
93
def append_jsonl(filepath: str, data: dict) -> None:
    """
    Append *data* as a single JSON line to the JSONL file at *filepath*.

    Args:
        filepath: Target file path (created if missing, appended otherwise).
        data: Dictionary to serialise.
    """
    import json

    line = json.dumps(data, ensure_ascii=False)
    with open(filepath, 'a', encoding='utf-8') as f:
        f.write(line + '\n')
104
+
105
+
106
def read_jsonl(filepath: str) -> list:
    """
    Read a JSONL file, silently skipping lines that fail to parse.

    Args:
        filepath: Path to the JSONL file.

    Returns:
        List of parsed records; empty list when the file does not exist.
    """
    import json
    import os

    if not os.path.exists(filepath):
        return []

    records = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for raw_line in f:
            try:
                records.append(json.loads(raw_line))
            except json.JSONDecodeError:
                # Malformed lines are best-effort skipped by design.
                continue
    return records
130
+
131
+
132
def safe_truncate(text: str, max_length: int, suffix: str = "\n... [truncated]") -> str:
    """
    Hard-cut *text* at *max_length* characters, appending *suffix* when cut.

    Args:
        text: Original text (may be empty/None — returned unchanged).
        max_length: Maximum number of kept characters before the suffix.
        suffix: Marker appended after a cut.

    Returns:
        The (possibly truncated) text.
    """
    needs_cut = bool(text) and len(text) > max_length
    if not needs_cut:
        return text
    return text[:max_length] + suffix
147
+
148
+
149
def smart_truncate(text: str, max_length: int, keep_ratio: float = 0.7) -> str:
    """
    Truncate while keeping both head and tail of *text* (good for code context).

    Args:
        text: Original text.
        max_length: Total length budget for the result.
        keep_ratio: Fraction of the budget given to the head (rest to the tail).

    Returns:
        Truncated text preserving the leading and trailing content.
    """
    if not text or len(text) <= max_length:
        return text

    separator = "\n\n... [中间内容已省略] ...\n\n"
    budget = max_length - len(separator)

    # Not enough room for head + separator + tail: fall back to a hard cut.
    if budget <= 0:
        return text[:max_length]

    head_len = int(budget * keep_ratio)
    tail_len = budget - head_len
    return text[:head_len] + separator + text[-tail_len:]
174
+
175
+
176
+ # ============================================================================
177
+ # SFT 数据长度配置
178
+ # ============================================================================
179
+
180
class SFTLengthConfig:
    """Length budgets for SFT training samples (characters, with rough token estimates)."""

    # Context limit (retrieved code context)
    MAX_CONTEXT_CHARS = 2500  # max characters (~800 tokens)

    # Answer limit (model-generated answer)
    MAX_ANSWER_CHARS = 3000  # max characters (~1000 tokens)

    # Query limit
    MAX_QUERY_CHARS = 500  # max characters

    # Overall limit
    MAX_TOTAL_CHARS = 6000  # total character cap (~2000 tokens)

    # Token estimate (mixed Chinese/English, conservative)
    CHARS_PER_TOKEN = 3  # average characters per token
frontend-dist/assets/Tableau10-B-NsZVaP.js ADDED
@@ -0,0 +1 @@
 
 
1
// Decode a packed hex string into an array of "#rrggbb" colors.
// Every 6 characters of the input encode one color.
function o(e) {
  const count = (e.length / 6) | 0;
  const palette = new Array(count);
  for (let i = 0; i < count; i++) {
    palette[i] = "#" + e.slice(i * 6, (i + 1) * 6);
  }
  return palette;
}

// Tableau10 categorical color scheme (10 colors).
const r = o("4e79a7f28e2ce1575976b7b259a14fedc949af7aa1ff9da79c755fbab0ab");
export { r as s };
frontend-dist/assets/arc-BscbqCCW.js ADDED
@@ -0,0 +1 @@
 
 
1
// Minified build artifact: d3-shape's arc generator (`vn`, exported as `a`).
// cn/yn/gn/mn/pn are the default accessors (innerRadius/outerRadius/startAngle/
// endAngle/padAngle); xn intersects two line segments; W computes corner-radius
// circle tangency points. Do not hand-edit — regenerate from the frontend build.
import{w as ln,c as I}from"./path-CbwjOpE9.js";import{av as an,aw as j,ax as D,ay as rn,az as y,V as on,aA as K,aB as _,aC as un,aD as t,aE as tn,aF as sn,aG as fn}from"./index-BCNM9-Ly.js";function cn(l){return l.innerRadius}function yn(l){return l.outerRadius}function gn(l){return l.startAngle}function mn(l){return l.endAngle}function pn(l){return l&&l.padAngle}function xn(l,h,z,E,v,A,O,a){var B=z-l,i=E-h,n=O-v,m=a-A,r=m*B-n*i;if(!(r*r<y))return r=(n*(h-A)-m*(l-v))/r,[l+r*B,h+r*i]}function W(l,h,z,E,v,A,O){var a=l-z,B=h-E,i=(O?A:-A)/K(a*a+B*B),n=i*B,m=-i*a,r=l+n,s=h+m,f=z+n,c=E+m,S=(r+f)/2,o=(s+c)/2,p=f-r,g=c-s,R=p*p+g*g,T=v-A,w=r*c-f*s,C=(g<0?-1:1)*K(tn(0,T*T*R-w*w)),F=(w*g-p*C)/R,G=(-w*p-g*C)/R,P=(w*g+p*C)/R,x=(-w*p+g*C)/R,d=F-S,e=G-o,u=P-S,V=x-o;return d*d+e*e>u*u+V*V&&(F=P,G=x),{cx:F,cy:G,x01:-n,y01:-m,x11:F*(v/T-1),y11:G*(v/T-1)}}function vn(){var l=cn,h=yn,z=I(0),E=null,v=gn,A=mn,O=pn,a=null,B=ln(i);function i(){var n,m,r=+l.apply(this,arguments),s=+h.apply(this,arguments),f=v.apply(this,arguments)-rn,c=A.apply(this,arguments)-rn,S=un(c-f),o=c>f;if(a||(a=n=B()),s<r&&(m=s,s=r,r=m),!(s>y))a.moveTo(0,0);else if(S>on-y)a.moveTo(s*j(f),s*D(f)),a.arc(0,0,s,f,c,!o),r>y&&(a.moveTo(r*j(c),r*D(c)),a.arc(0,0,r,c,f,o));else{var p=f,g=c,R=f,T=c,w=S,C=S,F=O.apply(this,arguments)/2,G=F>y&&(E?+E.apply(this,arguments):K(r*r+s*s)),P=_(un(s-r)/2,+z.apply(this,arguments)),x=P,d=P,e,u;if(G>y){var V=sn(G/r*D(F)),L=sn(G/s*D(F));(w-=V*2)>y?(V*=o?1:-1,R+=V,T-=V):(w=0,R=T=(f+c)/2),(C-=L*2)>y?(L*=o?1:-1,p+=L,g-=L):(C=0,p=g=(f+c)/2)}var H=s*j(p),J=s*D(p),M=r*j(T),N=r*D(T);if(P>y){var Q=s*j(g),U=s*D(g),X=r*j(R),Y=r*D(R),q;if(S<an)if(q=xn(H,J,X,Y,Q,U,M,N)){var Z=H-q[0],$=J-q[1],k=Q-q[0],b=U-q[1],nn=1/D(fn((Z*k+$*b)/(K(Z*Z+$*$)*K(k*k+b*b)))/2),en=K(q[0]*q[0]+q[1]*q[1]);x=_(P,(r-en)/(nn-1)),d=_(P,(s-en)/(nn+1))}else x=d=0}C>y?d>y?(e=W(X,Y,H,J,s,d,o),u=W(Q,U,M,N,s,d,o),a.moveTo(e.cx+e.x01,e.cy+e.y01),d<P?a.arc(e.cx,e.cy,d,t(e.y01,e.x01),t(u.y01,u.x01),!o):(a.arc(e.cx,e.cy,d,t(e.y01,e.x01),t(e.y11,e.x11),!o),a.arc(0,0,s,t(e.cy+e.y11,e.cx+e.x11),t(u.cy+u.y11,u.cx+u.x11),!o),a.arc(u.cx,u.cy,d,t(u.y11,u.x11),t(u.y01,u.x01),!o))):(a.moveTo(H,J),a.arc(0,0,s,p,g,!o)):a.moveTo(H,J),!(r>y)||!(w>y)?a.lineTo(M,N):x>y?(e=W(M,N,Q,U,r,-x,o),u=W(H,J,X,Y,r,-x,o),a.lineTo(e.cx+e.x01,e.cy+e.y01),x<P?a.arc(e.cx,e.cy,x,t(e.y01,e.x01),t(u.y01,u.x01),!o):(a.arc(e.cx,e.cy,x,t(e.y01,e.x01),t(e.y11,e.x11),!o),a.arc(0,0,r,t(e.cy+e.y11,e.cx+e.x11),t(u.cy+u.y11,u.cx+u.x11),o),a.arc(u.cx,u.cy,x,t(u.y11,u.x11),t(u.y01,u.x01),!o))):a.arc(0,0,r,T,R,o)}if(a.closePath(),n)return a=null,n+""||null}return i.centroid=function(){var n=(+l.apply(this,arguments)+ +h.apply(this,arguments))/2,m=(+v.apply(this,arguments)+ +A.apply(this,arguments))/2-an/2;return[j(m)*n,D(m)*n]},i.innerRadius=function(n){return arguments.length?(l=typeof n=="function"?n:I(+n),i):l},i.outerRadius=function(n){return arguments.length?(h=typeof n=="function"?n:I(+n),i):h},i.cornerRadius=function(n){return arguments.length?(z=typeof n=="function"?n:I(+n),i):z},i.padRadius=function(n){return arguments.length?(E=n==null?null:typeof n=="function"?n:I(+n),i):E},i.startAngle=function(n){return arguments.length?(v=typeof n=="function"?n:I(+n),i):v},i.endAngle=function(n){return arguments.length?(A=typeof n=="function"?n:I(+n),i):A},i.padAngle=function(n){return arguments.length?(O=typeof n=="function"?n:I(+n),i):O},i.context=function(n){return arguments.length?(a=n??null,i):a},i}export{vn as a};
frontend-dist/assets/array-BKyUJesY.js ADDED
@@ -0,0 +1 @@
 
 
1
// Normalize input to something indexable: array-likes (objects with a
// `length`) pass through untouched; any other iterable is materialized
// with Array.from.
function t(r) {
  const isArrayLike = typeof r === "object" && "length" in r;
  return isArrayLike ? r : Array.from(r);
}
export { t as a };
frontend-dist/assets/blockDiagram-c4efeb88-CL85BYG9.js ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import{_ as se,d as H,e as ye,l as S,E as Ee,B as we,k as De,c as he,p as ve}from"./index-BCNM9-Ly.js";import{c as Ne}from"./clone-C4pHamD7.js";import{i as ke,c as Ie,b as Oe,d as Te,a as ge,p as ze}from"./edges-96097737-CqpaF4BI.js";import{G as Ce}from"./graph-CY8eBbAS.js";import{o as Ae}from"./ordinal-Cboi1Yqb.js";import{c as Re}from"./channel-DsKT-zfZ.js";import{s as Be}from"./Tableau10-B-NsZVaP.js";import"./createText-1719965b-BZ0xZVnk.js";import"./line-DdWeXrJe.js";import"./array-BKyUJesY.js";import"./path-CbwjOpE9.js";import"./init-Gi6I4Gst.js";var le,oe,ee=function(){var e=function(D,o,s,i){for(s=s||{},i=D.length;i--;s[D[i]]=o);return s},a=[1,7],d=[1,13],c=[1,14],n=[1,15],g=[1,19],l=[1,16],f=[1,17],b=[1,18],p=[8,30],x=[8,21,28,29,30,31,32,40,44,47],y=[1,23],T=[1,24],v=[8,15,16,21,28,29,30,31,32,40,44,47],N=[8,15,16,21,27,28,29,30,31,32,40,44,47],E=[1,49],L={trace:function(){},yy:{},symbols_:{error:2,spaceLines:3,SPACELINE:4,NL:5,separator:6,SPACE:7,EOF:8,start:9,BLOCK_DIAGRAM_KEY:10,document:11,stop:12,statement:13,link:14,LINK:15,START_LINK:16,LINK_LABEL:17,STR:18,nodeStatement:19,columnsStatement:20,SPACE_BLOCK:21,blockStatement:22,classDefStatement:23,cssClassStatement:24,styleStatement:25,node:26,SIZE:27,COLUMNS:28,"id-block":29,end:30,block:31,NODE_ID:32,nodeShapeNLabel:33,dirList:34,DIR:35,NODE_DSTART:36,NODE_DEND:37,BLOCK_ARROW_START:38,BLOCK_ARROW_END:39,classDef:40,CLASSDEF_ID:41,CLASSDEF_STYLEOPTS:42,DEFAULT:43,class:44,CLASSENTITY_IDS:45,STYLECLASS:46,style:47,STYLE_ENTITY_IDS:48,STYLE_DEFINITION_DATA:49,$accept:0,$end:1},terminals_:{2:"error",4:"SPACELINE",5:"NL",7:"SPACE",8:"EOF",10:"BLOCK_DIAGRAM_KEY",15:"LINK",16:"START_LINK",17:"LINK_LABEL",18:"STR",21:"SPACE_BLOCK",27:"SIZE",28:"COLUMNS",29:"id-block",30:"end",31:"block",32:"NODE_ID",35:"DIR",36:"NODE_DSTART",37:"NODE_DEND",38:"BLOCK_ARROW_START",39:"BLOCK_ARROW_END",40:"classDef",41:"CLASSDEF_ID",42:"CLASSDEF_STYLEOPTS",43:"DEFAULT",44:"class",45:"CLASSENTITY_IDS",46:"STYLECLASS",47:"style
",48:"STYLE_ENTITY_IDS",49:"STYLE_DEFINITION_DATA"},productions_:[0,[3,1],[3,2],[3,2],[6,1],[6,1],[6,1],[9,3],[12,1],[12,1],[12,2],[12,2],[11,1],[11,2],[14,1],[14,4],[13,1],[13,1],[13,1],[13,1],[13,1],[13,1],[13,1],[19,3],[19,2],[19,1],[20,1],[22,4],[22,3],[26,1],[26,2],[34,1],[34,2],[33,3],[33,4],[23,3],[23,3],[24,3],[25,3]],performAction:function(o,s,i,u,h,t,m){var r=t.length-1;switch(h){case 4:u.getLogger().debug("Rule: separator (NL) ");break;case 5:u.getLogger().debug("Rule: separator (Space) ");break;case 6:u.getLogger().debug("Rule: separator (EOF) ");break;case 7:u.getLogger().debug("Rule: hierarchy: ",t[r-1]),u.setHierarchy(t[r-1]);break;case 8:u.getLogger().debug("Stop NL ");break;case 9:u.getLogger().debug("Stop EOF ");break;case 10:u.getLogger().debug("Stop NL2 ");break;case 11:u.getLogger().debug("Stop EOF2 ");break;case 12:u.getLogger().debug("Rule: statement: ",t[r]),typeof t[r].length=="number"?this.$=t[r]:this.$=[t[r]];break;case 13:u.getLogger().debug("Rule: statement #2: ",t[r-1]),this.$=[t[r-1]].concat(t[r]);break;case 14:u.getLogger().debug("Rule: link: ",t[r],o),this.$={edgeTypeStr:t[r],label:""};break;case 15:u.getLogger().debug("Rule: LABEL link: ",t[r-3],t[r-1],t[r]),this.$={edgeTypeStr:t[r],label:t[r-1]};break;case 18:const R=parseInt(t[r]),Y=u.generateId();this.$={id:Y,type:"space",label:"",width:R,children:[]};break;case 23:u.getLogger().debug("Rule: (nodeStatement link node) ",t[r-2],t[r-1],t[r]," typestr: ",t[r-1].edgeTypeStr);const F=u.edgeStrToEdgeData(t[r-1].edgeTypeStr);this.$=[{id:t[r-2].id,label:t[r-2].label,type:t[r-2].type,directions:t[r-2].directions},{id:t[r-2].id+"-"+t[r].id,start:t[r-2].id,end:t[r].id,label:t[r-1].label,type:"edge",directions:t[r].directions,arrowTypeEnd:F,arrowTypeStart:"arrow_open"},{id:t[r].id,label:t[r].label,type:u.typeStr2Type(t[r].typeStr),directions:t[r].directions}];break;case 24:u.getLogger().debug("Rule: nodeStatement (abc88 node size) 
",t[r-1],t[r]),this.$={id:t[r-1].id,label:t[r-1].label,type:u.typeStr2Type(t[r-1].typeStr),directions:t[r-1].directions,widthInColumns:parseInt(t[r],10)};break;case 25:u.getLogger().debug("Rule: nodeStatement (node) ",t[r]),this.$={id:t[r].id,label:t[r].label,type:u.typeStr2Type(t[r].typeStr),directions:t[r].directions,widthInColumns:1};break;case 26:u.getLogger().debug("APA123",this?this:"na"),u.getLogger().debug("COLUMNS: ",t[r]),this.$={type:"column-setting",columns:t[r]==="auto"?-1:parseInt(t[r])};break;case 27:u.getLogger().debug("Rule: id-block statement : ",t[r-2],t[r-1]),u.generateId(),this.$={...t[r-2],type:"composite",children:t[r-1]};break;case 28:u.getLogger().debug("Rule: blockStatement : ",t[r-2],t[r-1],t[r]);const C=u.generateId();this.$={id:C,type:"composite",label:"",children:t[r-1]};break;case 29:u.getLogger().debug("Rule: node (NODE_ID separator): ",t[r]),this.$={id:t[r]};break;case 30:u.getLogger().debug("Rule: node (NODE_ID nodeShapeNLabel separator): ",t[r-1],t[r]),this.$={id:t[r-1],label:t[r].label,typeStr:t[r].typeStr,directions:t[r].directions};break;case 31:u.getLogger().debug("Rule: dirList: ",t[r]),this.$=[t[r]];break;case 32:u.getLogger().debug("Rule: dirList: ",t[r-1],t[r]),this.$=[t[r-1]].concat(t[r]);break;case 33:u.getLogger().debug("Rule: nodeShapeNLabel: ",t[r-2],t[r-1],t[r]),this.$={typeStr:t[r-2]+t[r],label:t[r-1]};break;case 34:u.getLogger().debug("Rule: BLOCK_ARROW nodeShapeNLabel: ",t[r-3],t[r-2]," #3:",t[r-1],t[r]),this.$={typeStr:t[r-3]+t[r],label:t[r-2],directions:t[r-1]};break;case 35:case 36:this.$={type:"classDef",id:t[r-1].trim(),css:t[r].trim()};break;case 37:this.$={type:"applyClass",id:t[r-1].trim(),styleClass:t[r].trim()};break;case 
38:this.$={type:"applyStyles",id:t[r-1].trim(),stylesStr:t[r].trim()};break}},table:[{9:1,10:[1,2]},{1:[3]},{11:3,13:4,19:5,20:6,21:a,22:8,23:9,24:10,25:11,26:12,28:d,29:c,31:n,32:g,40:l,44:f,47:b},{8:[1,20]},e(p,[2,12],{13:4,19:5,20:6,22:8,23:9,24:10,25:11,26:12,11:21,21:a,28:d,29:c,31:n,32:g,40:l,44:f,47:b}),e(x,[2,16],{14:22,15:y,16:T}),e(x,[2,17]),e(x,[2,18]),e(x,[2,19]),e(x,[2,20]),e(x,[2,21]),e(x,[2,22]),e(v,[2,25],{27:[1,25]}),e(x,[2,26]),{19:26,26:12,32:g},{11:27,13:4,19:5,20:6,21:a,22:8,23:9,24:10,25:11,26:12,28:d,29:c,31:n,32:g,40:l,44:f,47:b},{41:[1,28],43:[1,29]},{45:[1,30]},{48:[1,31]},e(N,[2,29],{33:32,36:[1,33],38:[1,34]}),{1:[2,7]},e(p,[2,13]),{26:35,32:g},{32:[2,14]},{17:[1,36]},e(v,[2,24]),{11:37,13:4,14:22,15:y,16:T,19:5,20:6,21:a,22:8,23:9,24:10,25:11,26:12,28:d,29:c,31:n,32:g,40:l,44:f,47:b},{30:[1,38]},{42:[1,39]},{42:[1,40]},{46:[1,41]},{49:[1,42]},e(N,[2,30]),{18:[1,43]},{18:[1,44]},e(v,[2,23]),{18:[1,45]},{30:[1,46]},e(x,[2,28]),e(x,[2,35]),e(x,[2,36]),e(x,[2,37]),e(x,[2,38]),{37:[1,47]},{34:48,35:E},{15:[1,50]},e(x,[2,27]),e(N,[2,33]),{39:[1,51]},{34:52,35:E,39:[2,31]},{32:[2,15]},e(N,[2,34]),{39:[2,32]}],defaultActions:{20:[2,7],23:[2,14],50:[2,15],52:[2,32]},parseError:function(o,s){if(s.recoverable)this.trace(o);else{var i=new Error(o);throw i.hash=s,i}},parse:function(o){var s=this,i=[0],u=[],h=[null],t=[],m=this.table,r="",R=0,Y=0,F=2,C=1,Le=t.slice.call(arguments,1),w=Object.create(this.lexer),K={yy:{}};for(var Z in this.yy)Object.prototype.hasOwnProperty.call(this.yy,Z)&&(K.yy[Z]=this.yy[Z]);w.setInput(o,K.yy),K.yy.lexer=w,K.yy.parser=this,typeof w.yylloc>"u"&&(w.yylloc={});var J=w.yylloc;t.push(J);var me=w.options&&w.options.ranges;typeof K.yy.parseError=="function"?this.parseError=K.yy.parseError:this.parseError=Object.getPrototypeOf(this).parseError;function _e(){var P;return P=u.pop()||w.lex()||C,typeof P!="number"&&(P instanceof Array&&(u=P,P=u.pop()),P=s.symbols_[P]||P),P}for(var 
I,M,z,Q,W={},X,B,ae,G;;){if(M=i[i.length-1],this.defaultActions[M]?z=this.defaultActions[M]:((I===null||typeof I>"u")&&(I=_e()),z=m[M]&&m[M][I]),typeof z>"u"||!z.length||!z[0]){var $="";G=[];for(X in m[M])this.terminals_[X]&&X>F&&G.push("'"+this.terminals_[X]+"'");w.showPosition?$="Parse error on line "+(R+1)+`:
2
+ `+w.showPosition()+`
3
+ Expecting `+G.join(", ")+", got '"+(this.terminals_[I]||I)+"'":$="Parse error on line "+(R+1)+": Unexpected "+(I==C?"end of input":"'"+(this.terminals_[I]||I)+"'"),this.parseError($,{text:w.match,token:this.terminals_[I]||I,line:w.yylineno,loc:J,expected:G})}if(z[0]instanceof Array&&z.length>1)throw new Error("Parse Error: multiple actions possible at state: "+M+", token: "+I);switch(z[0]){case 1:i.push(I),h.push(w.yytext),t.push(w.yylloc),i.push(z[1]),I=null,Y=w.yyleng,r=w.yytext,R=w.yylineno,J=w.yylloc;break;case 2:if(B=this.productions_[z[1]][1],W.$=h[h.length-B],W._$={first_line:t[t.length-(B||1)].first_line,last_line:t[t.length-1].last_line,first_column:t[t.length-(B||1)].first_column,last_column:t[t.length-1].last_column},me&&(W._$.range=[t[t.length-(B||1)].range[0],t[t.length-1].range[1]]),Q=this.performAction.apply(W,[r,Y,R,K.yy,z[1],h,t].concat(Le)),typeof Q<"u")return Q;B&&(i=i.slice(0,-1*B*2),h=h.slice(0,-1*B),t=t.slice(0,-1*B)),i.push(this.productions_[z[1]][0]),h.push(W.$),t.push(W._$),ae=m[i[i.length-2]][i[i.length-1]],i.push(ae);break;case 3:return!0}}return!0}},A=function(){var D={EOF:1,parseError:function(s,i){if(this.yy.parser)this.yy.parser.parseError(s,i);else throw new Error(s)},setInput:function(o,s){return this.yy=s||this.yy||{},this._input=o,this._more=this._backtrack=this.done=!1,this.yylineno=this.yyleng=0,this.yytext=this.matched=this.match="",this.conditionStack=["INITIAL"],this.yylloc={first_line:1,first_column:0,last_line:1,last_column:0},this.options.ranges&&(this.yylloc.range=[0,0]),this.offset=0,this},input:function(){var o=this._input[0];this.yytext+=o,this.yyleng++,this.offset++,this.match+=o,this.matched+=o;var s=o.match(/(?:\r\n?|\n).*/g);return s?(this.yylineno++,this.yylloc.last_line++):this.yylloc.last_column++,this.options.ranges&&this.yylloc.range[1]++,this._input=this._input.slice(1),o},unput:function(o){var 
s=o.length,i=o.split(/(?:\r\n?|\n)/g);this._input=o+this._input,this.yytext=this.yytext.substr(0,this.yytext.length-s),this.offset-=s;var u=this.match.split(/(?:\r\n?|\n)/g);this.match=this.match.substr(0,this.match.length-1),this.matched=this.matched.substr(0,this.matched.length-1),i.length-1&&(this.yylineno-=i.length-1);var h=this.yylloc.range;return this.yylloc={first_line:this.yylloc.first_line,last_line:this.yylineno+1,first_column:this.yylloc.first_column,last_column:i?(i.length===u.length?this.yylloc.first_column:0)+u[u.length-i.length].length-i[0].length:this.yylloc.first_column-s},this.options.ranges&&(this.yylloc.range=[h[0],h[0]+this.yyleng-s]),this.yyleng=this.yytext.length,this},more:function(){return this._more=!0,this},reject:function(){if(this.options.backtrack_lexer)this._backtrack=!0;else return this.parseError("Lexical error on line "+(this.yylineno+1)+`. You can only invoke reject() in the lexer when the lexer is of the backtracking persuasion (options.backtrack_lexer = true).
4
+ `+this.showPosition(),{text:"",token:null,line:this.yylineno});return this},less:function(o){this.unput(this.match.slice(o))},pastInput:function(){var o=this.matched.substr(0,this.matched.length-this.match.length);return(o.length>20?"...":"")+o.substr(-20).replace(/\n/g,"")},upcomingInput:function(){var o=this.match;return o.length<20&&(o+=this._input.substr(0,20-o.length)),(o.substr(0,20)+(o.length>20?"...":"")).replace(/\n/g,"")},showPosition:function(){var o=this.pastInput(),s=new Array(o.length+1).join("-");return o+this.upcomingInput()+`
5
+ `+s+"^"},test_match:function(o,s){var i,u,h;if(this.options.backtrack_lexer&&(h={yylineno:this.yylineno,yylloc:{first_line:this.yylloc.first_line,last_line:this.last_line,first_column:this.yylloc.first_column,last_column:this.yylloc.last_column},yytext:this.yytext,match:this.match,matches:this.matches,matched:this.matched,yyleng:this.yyleng,offset:this.offset,_more:this._more,_input:this._input,yy:this.yy,conditionStack:this.conditionStack.slice(0),done:this.done},this.options.ranges&&(h.yylloc.range=this.yylloc.range.slice(0))),u=o[0].match(/(?:\r\n?|\n).*/g),u&&(this.yylineno+=u.length),this.yylloc={first_line:this.yylloc.last_line,last_line:this.yylineno+1,first_column:this.yylloc.last_column,last_column:u?u[u.length-1].length-u[u.length-1].match(/\r?\n?/)[0].length:this.yylloc.last_column+o[0].length},this.yytext+=o[0],this.match+=o[0],this.matches=o,this.yyleng=this.yytext.length,this.options.ranges&&(this.yylloc.range=[this.offset,this.offset+=this.yyleng]),this._more=!1,this._backtrack=!1,this._input=this._input.slice(o[0].length),this.matched+=o[0],i=this.performAction.call(this,this.yy,this,s,this.conditionStack[this.conditionStack.length-1]),this.done&&this._input&&(this.done=!1),i)return i;if(this._backtrack){for(var t in h)this[t]=h[t];return!1}return!1},next:function(){if(this.done)return this.EOF;this._input||(this.done=!0);var o,s,i,u;this._more||(this.yytext="",this.match="");for(var h=this._currentRules(),t=0;t<h.length;t++)if(i=this._input.match(this.rules[h[t]]),i&&(!s||i[0].length>s[0].length)){if(s=i,u=t,this.options.backtrack_lexer){if(o=this.test_match(i,h[t]),o!==!1)return o;if(this._backtrack){s=!1;continue}else return!1}else if(!this.options.flex)break}return s?(o=this.test_match(s,h[u]),o!==!1?o:!1):this._input===""?this.EOF:this.parseError("Lexical error on line "+(this.yylineno+1)+`. Unrecognized text.
6
+ `+this.showPosition(),{text:"",token:null,line:this.yylineno})},lex:function(){var s=this.next();return s||this.lex()},begin:function(s){this.conditionStack.push(s)},popState:function(){var s=this.conditionStack.length-1;return s>0?this.conditionStack.pop():this.conditionStack[0]},_currentRules:function(){return this.conditionStack.length&&this.conditionStack[this.conditionStack.length-1]?this.conditions[this.conditionStack[this.conditionStack.length-1]].rules:this.conditions.INITIAL.rules},topState:function(s){return s=this.conditionStack.length-1-Math.abs(s||0),s>=0?this.conditionStack[s]:"INITIAL"},pushState:function(s){this.begin(s)},stateStackSize:function(){return this.conditionStack.length},options:{},performAction:function(s,i,u,h){switch(u){case 0:return 10;case 1:return s.getLogger().debug("Found space-block"),31;case 2:return s.getLogger().debug("Found nl-block"),31;case 3:return s.getLogger().debug("Found space-block"),29;case 4:s.getLogger().debug(".",i.yytext);break;case 5:s.getLogger().debug("_",i.yytext);break;case 6:return 5;case 7:return i.yytext=-1,28;case 8:return i.yytext=i.yytext.replace(/columns\s+/,""),s.getLogger().debug("COLUMNS (LEX)",i.yytext),28;case 9:this.pushState("md_string");break;case 10:return"MD_STR";case 11:this.popState();break;case 12:this.pushState("string");break;case 13:s.getLogger().debug("LEX: POPPING STR:",i.yytext),this.popState();break;case 14:return s.getLogger().debug("LEX: STR end:",i.yytext),"STR";case 15:return i.yytext=i.yytext.replace(/space\:/,""),s.getLogger().debug("SPACE NUM (LEX)",i.yytext),21;case 16:return i.yytext="1",s.getLogger().debug("COLUMNS (LEX)",i.yytext),21;case 17:return 43;case 18:return"LINKSTYLE";case 19:return"INTERPOLATE";case 20:return this.pushState("CLASSDEF"),40;case 21:return this.popState(),this.pushState("CLASSDEFID"),"DEFAULT_CLASSDEF_ID";case 22:return this.popState(),this.pushState("CLASSDEFID"),41;case 23:return this.popState(),42;case 24:return 
this.pushState("CLASS"),44;case 25:return this.popState(),this.pushState("CLASS_STYLE"),45;case 26:return this.popState(),46;case 27:return this.pushState("STYLE_STMNT"),47;case 28:return this.popState(),this.pushState("STYLE_DEFINITION"),48;case 29:return this.popState(),49;case 30:return this.pushState("acc_title"),"acc_title";case 31:return this.popState(),"acc_title_value";case 32:return this.pushState("acc_descr"),"acc_descr";case 33:return this.popState(),"acc_descr_value";case 34:this.pushState("acc_descr_multiline");break;case 35:this.popState();break;case 36:return"acc_descr_multiline_value";case 37:return 30;case 38:return this.popState(),s.getLogger().debug("Lex: (("),"NODE_DEND";case 39:return this.popState(),s.getLogger().debug("Lex: (("),"NODE_DEND";case 40:return this.popState(),s.getLogger().debug("Lex: ))"),"NODE_DEND";case 41:return this.popState(),s.getLogger().debug("Lex: (("),"NODE_DEND";case 42:return this.popState(),s.getLogger().debug("Lex: (("),"NODE_DEND";case 43:return this.popState(),s.getLogger().debug("Lex: (-"),"NODE_DEND";case 44:return this.popState(),s.getLogger().debug("Lex: -)"),"NODE_DEND";case 45:return this.popState(),s.getLogger().debug("Lex: (("),"NODE_DEND";case 46:return this.popState(),s.getLogger().debug("Lex: ]]"),"NODE_DEND";case 47:return this.popState(),s.getLogger().debug("Lex: ("),"NODE_DEND";case 48:return this.popState(),s.getLogger().debug("Lex: ])"),"NODE_DEND";case 49:return this.popState(),s.getLogger().debug("Lex: /]"),"NODE_DEND";case 50:return this.popState(),s.getLogger().debug("Lex: /]"),"NODE_DEND";case 51:return this.popState(),s.getLogger().debug("Lex: )]"),"NODE_DEND";case 52:return this.popState(),s.getLogger().debug("Lex: )"),"NODE_DEND";case 53:return this.popState(),s.getLogger().debug("Lex: ]>"),"NODE_DEND";case 54:return this.popState(),s.getLogger().debug("Lex: ]"),"NODE_DEND";case 55:return s.getLogger().debug("Lexa: -)"),this.pushState("NODE"),36;case 56:return s.getLogger().debug("Lexa: 
(-"),this.pushState("NODE"),36;case 57:return s.getLogger().debug("Lexa: ))"),this.pushState("NODE"),36;case 58:return s.getLogger().debug("Lexa: )"),this.pushState("NODE"),36;case 59:return s.getLogger().debug("Lex: ((("),this.pushState("NODE"),36;case 60:return s.getLogger().debug("Lexa: )"),this.pushState("NODE"),36;case 61:return s.getLogger().debug("Lexa: )"),this.pushState("NODE"),36;case 62:return s.getLogger().debug("Lexa: )"),this.pushState("NODE"),36;case 63:return s.getLogger().debug("Lexc: >"),this.pushState("NODE"),36;case 64:return s.getLogger().debug("Lexa: (["),this.pushState("NODE"),36;case 65:return s.getLogger().debug("Lexa: )"),this.pushState("NODE"),36;case 66:return this.pushState("NODE"),36;case 67:return this.pushState("NODE"),36;case 68:return this.pushState("NODE"),36;case 69:return this.pushState("NODE"),36;case 70:return this.pushState("NODE"),36;case 71:return this.pushState("NODE"),36;case 72:return this.pushState("NODE"),36;case 73:return s.getLogger().debug("Lexa: ["),this.pushState("NODE"),36;case 74:return this.pushState("BLOCK_ARROW"),s.getLogger().debug("LEX ARR START"),38;case 75:return s.getLogger().debug("Lex: NODE_ID",i.yytext),32;case 76:return s.getLogger().debug("Lex: EOF",i.yytext),8;case 77:this.pushState("md_string");break;case 78:this.pushState("md_string");break;case 79:return"NODE_DESCR";case 80:this.popState();break;case 81:s.getLogger().debug("Lex: Starting string"),this.pushState("string");break;case 82:s.getLogger().debug("LEX ARR: Starting string"),this.pushState("string");break;case 83:return s.getLogger().debug("LEX: NODE_DESCR:",i.yytext),"NODE_DESCR";case 84:s.getLogger().debug("LEX POPPING"),this.popState();break;case 85:s.getLogger().debug("Lex: =>BAE"),this.pushState("ARROW_DIR");break;case 86:return i.yytext=i.yytext.replace(/^,\s*/,""),s.getLogger().debug("Lex (right): dir:",i.yytext),"DIR";case 87:return i.yytext=i.yytext.replace(/^,\s*/,""),s.getLogger().debug("Lex (left):",i.yytext),"DIR";case 
88:return i.yytext=i.yytext.replace(/^,\s*/,""),s.getLogger().debug("Lex (x):",i.yytext),"DIR";case 89:return i.yytext=i.yytext.replace(/^,\s*/,""),s.getLogger().debug("Lex (y):",i.yytext),"DIR";case 90:return i.yytext=i.yytext.replace(/^,\s*/,""),s.getLogger().debug("Lex (up):",i.yytext),"DIR";case 91:return i.yytext=i.yytext.replace(/^,\s*/,""),s.getLogger().debug("Lex (down):",i.yytext),"DIR";case 92:return i.yytext="]>",s.getLogger().debug("Lex (ARROW_DIR end):",i.yytext),this.popState(),this.popState(),"BLOCK_ARROW_END";case 93:return s.getLogger().debug("Lex: LINK","#"+i.yytext+"#"),15;case 94:return s.getLogger().debug("Lex: LINK",i.yytext),15;case 95:return s.getLogger().debug("Lex: LINK",i.yytext),15;case 96:return s.getLogger().debug("Lex: LINK",i.yytext),15;case 97:return s.getLogger().debug("Lex: START_LINK",i.yytext),this.pushState("LLABEL"),16;case 98:return s.getLogger().debug("Lex: START_LINK",i.yytext),this.pushState("LLABEL"),16;case 99:return s.getLogger().debug("Lex: START_LINK",i.yytext),this.pushState("LLABEL"),16;case 100:this.pushState("md_string");break;case 101:return s.getLogger().debug("Lex: Starting string"),this.pushState("string"),"LINK_LABEL";case 102:return this.popState(),s.getLogger().debug("Lex: LINK","#"+i.yytext+"#"),15;case 103:return this.popState(),s.getLogger().debug("Lex: LINK",i.yytext),15;case 104:return this.popState(),s.getLogger().debug("Lex: LINK",i.yytext),15;case 105:return s.getLogger().debug("Lex: 
COLON",i.yytext),i.yytext=i.yytext.slice(1),27}},rules:[/^(?:block-beta\b)/,/^(?:block\s+)/,/^(?:block\n+)/,/^(?:block:)/,/^(?:[\s]+)/,/^(?:[\n]+)/,/^(?:((\u000D\u000A)|(\u000A)))/,/^(?:columns\s+auto\b)/,/^(?:columns\s+[\d]+)/,/^(?:["][`])/,/^(?:[^`"]+)/,/^(?:[`]["])/,/^(?:["])/,/^(?:["])/,/^(?:[^"]*)/,/^(?:space[:]\d+)/,/^(?:space\b)/,/^(?:default\b)/,/^(?:linkStyle\b)/,/^(?:interpolate\b)/,/^(?:classDef\s+)/,/^(?:DEFAULT\s+)/,/^(?:\w+\s+)/,/^(?:[^\n]*)/,/^(?:class\s+)/,/^(?:(\w+)+((,\s*\w+)*))/,/^(?:[^\n]*)/,/^(?:style\s+)/,/^(?:(\w+)+((,\s*\w+)*))/,/^(?:[^\n]*)/,/^(?:accTitle\s*:\s*)/,/^(?:(?!\n||)*[^\n]*)/,/^(?:accDescr\s*:\s*)/,/^(?:(?!\n||)*[^\n]*)/,/^(?:accDescr\s*\{\s*)/,/^(?:[\}])/,/^(?:[^\}]*)/,/^(?:end\b\s*)/,/^(?:\(\(\()/,/^(?:\)\)\))/,/^(?:[\)]\))/,/^(?:\}\})/,/^(?:\})/,/^(?:\(-)/,/^(?:-\))/,/^(?:\(\()/,/^(?:\]\])/,/^(?:\()/,/^(?:\]\))/,/^(?:\\\])/,/^(?:\/\])/,/^(?:\)\])/,/^(?:[\)])/,/^(?:\]>)/,/^(?:[\]])/,/^(?:-\))/,/^(?:\(-)/,/^(?:\)\))/,/^(?:\))/,/^(?:\(\(\()/,/^(?:\(\()/,/^(?:\{\{)/,/^(?:\{)/,/^(?:>)/,/^(?:\(\[)/,/^(?:\()/,/^(?:\[\[)/,/^(?:\[\|)/,/^(?:\[\()/,/^(?:\)\)\))/,/^(?:\[\\)/,/^(?:\[\/)/,/^(?:\[\\)/,/^(?:\[)/,/^(?:<\[)/,/^(?:[^\(\[\n\-\)\{\}\s\<\>:]+)/,/^(?:$)/,/^(?:["][`])/,/^(?:["][`])/,/^(?:[^`"]+)/,/^(?:[`]["])/,/^(?:["])/,/^(?:["])/,/^(?:[^"]+)/,/^(?:["])/,/^(?:\]>\s*\()/,/^(?:,?\s*right\s*)/,/^(?:,?\s*left\s*)/,/^(?:,?\s*x\s*)/,/^(?:,?\s*y\s*)/,/^(?:,?\s*up\s*)/,/^(?:,?\s*down\s*)/,/^(?:\)\s*)/,/^(?:\s*[xo<]?--+[-xo>]\s*)/,/^(?:\s*[xo<]?==+[=xo>]\s*)/,/^(?:\s*[xo<]?-?\.+-[xo>]?\s*)/,/^(?:\s*~~[\~]+\s*)/,/^(?:\s*[xo<]?--\s*)/,/^(?:\s*[xo<]?==\s*)/,/^(?:\s*[xo<]?-\.\s*)/,/^(?:["][`])/,/^(?:["])/,/^(?:\s*[xo<]?--+[-xo>]\s*)/,/^(?:\s*[xo<]?==+[=xo>]\s*)/,/^(?:\s*[xo<]?-?\.+-[xo>]?\s*)/,/^(?::\d+)/],conditions:{STYLE_DEFINITION:{rules:[29],inclusive:!1},STYLE_STMNT:{rules:[28],inclusive:!1},CLASSDEFID:{rules:[23],inclusive:!1},CLASSDEF:{rules:[21,22],inclusive:!1},CLASS_STYLE:{rules:[26],inclusive:!1},CLASS:{rules:[25],inclusive:!1},LLABEL
:{rules:[100,101,102,103,104],inclusive:!1},ARROW_DIR:{rules:[86,87,88,89,90,91,92],inclusive:!1},BLOCK_ARROW:{rules:[77,82,85],inclusive:!1},NODE:{rules:[38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,78,81],inclusive:!1},md_string:{rules:[10,11,79,80],inclusive:!1},space:{rules:[],inclusive:!1},string:{rules:[13,14,83,84],inclusive:!1},acc_descr_multiline:{rules:[35,36],inclusive:!1},acc_descr:{rules:[33],inclusive:!1},acc_title:{rules:[31],inclusive:!1},INITIAL:{rules:[0,1,2,3,4,5,6,7,8,9,12,15,16,17,18,19,20,24,27,30,32,34,37,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,93,94,95,96,97,98,99,105],inclusive:!0}}};return D}();L.lexer=A;function k(){this.yy={}}return k.prototype=L,L.Parser=k,new k}();ee.parser=ee;const Pe=ee;let O={},ie=[],V={};const ce="color",ue="fill",Fe="bgFill",pe=",",Ke=he();let j={};const Me=e=>De.sanitizeText(e,Ke),Ye=function(e,a=""){j[e]===void 0&&(j[e]={id:e,styles:[],textStyles:[]});const d=j[e];a!=null&&a.split(pe).forEach(c=>{const n=c.replace(/([^;]*);/,"$1").trim();if(c.match(ce)){const l=n.replace(ue,Fe).replace(ce,ue);d.textStyles.push(l)}d.styles.push(n)})},We=function(e,a=""){const d=O[e];a!=null&&(d.styles=a.split(pe))},Ve=function(e,a){e.split(",").forEach(function(d){let c=O[d];if(c===void 0){const n=d.trim();O[n]={id:n,type:"na",children:[]},c=O[n]}c.classes||(c.classes=[]),c.classes.push(a)})},fe=(e,a)=>{const d=e.flat(),c=[];for(const n of d){if(n.label&&(n.label=Me(n.label)),n.type==="classDef"){Ye(n.id,n.css);continue}if(n.type==="applyClass"){Ve(n.id,(n==null?void 0:n.styleClass)||"");continue}if(n.type==="applyStyles"){n!=null&&n.stylesStr&&We(n.id,n==null?void 0:n.stylesStr);continue}if(n.type==="column-setting")a.columns=n.columns||-1;else if(n.type==="edge")V[n.id]?V[n.id]++:V[n.id]=1,n.id=V[n.id]+"-"+n.id,ie.push(n);else{n.label||(n.type==="composite"?n.label="":n.label=n.id);const 
g=!O[n.id];if(g?O[n.id]=n:(n.type!=="na"&&(O[n.id].type=n.type),n.label!==n.id&&(O[n.id].label=n.label)),n.children&&fe(n.children,n),n.type==="space"){const l=n.width||1;for(let f=0;f<l;f++){const b=Ne(n);b.id=b.id+"-"+f,O[b.id]=b,c.push(b)}}else g&&c.push(n)}}a.children=c};let re=[],U={id:"root",type:"composite",children:[],columns:-1};const je=()=>{S.debug("Clear called"),Ee(),U={id:"root",type:"composite",children:[],columns:-1},O={root:U},re=[],j={},ie=[],V={}};function Ue(e){switch(S.debug("typeStr2Type",e),e){case"[]":return"square";case"()":return S.debug("we have a round"),"round";case"(())":return"circle";case">]":return"rect_left_inv_arrow";case"{}":return"diamond";case"{{}}":return"hexagon";case"([])":return"stadium";case"[[]]":return"subroutine";case"[()]":return"cylinder";case"((()))":return"doublecircle";case"[//]":return"lean_right";case"[\\\\]":return"lean_left";case"[/\\]":return"trapezoid";case"[\\/]":return"inv_trapezoid";case"<[]>":return"block_arrow";default:return"na"}}function Xe(e){switch(S.debug("typeStr2Type",e),e){case"==":return"thick";default:return"normal"}}function Ge(e){switch(e.trim()){case"--x":return"arrow_cross";case"--o":return"arrow_circle";default:return"arrow_point"}}let de=0;const He=()=>(de++,"id-"+Math.random().toString(36).substr(2,12)+"-"+de),qe=e=>{U.children=e,fe(e,U),re=U.children},Ze=e=>{const a=O[e];return a?a.columns?a.columns:a.children?a.children.length:-1:-1},Je=()=>[...Object.values(O)],Qe=()=>re||[],$e=()=>ie,et=e=>O[e],tt=e=>{O[e.id]=e},st=()=>console,it=function(){return j},rt={getConfig:()=>se().block,typeStr2Type:Ue,edgeTypeStr2Type:Xe,edgeStrToEdgeData:Ge,getLogger:st,getBlocksFlat:Je,getBlocks:Qe,getEdges:$e,setHierarchy:qe,getBlock:et,setBlock:tt,getColumns:Ze,getClasses:it,clear:je,generateId:He},nt=rt,q=(e,a)=>{const d=Re,c=d(e,"r"),n=d(e,"g"),g=d(e,"b");return we(c,n,g,a)},at=e=>`.label {
7
+ font-family: ${e.fontFamily};
8
+ color: ${e.nodeTextColor||e.textColor};
9
+ }
10
+ .cluster-label text {
11
+ fill: ${e.titleColor};
12
+ }
13
+ .cluster-label span,p {
14
+ color: ${e.titleColor};
15
+ }
16
+
17
+
18
+
19
+ .label text,span,p {
20
+ fill: ${e.nodeTextColor||e.textColor};
21
+ color: ${e.nodeTextColor||e.textColor};
22
+ }
23
+
24
+ .node rect,
25
+ .node circle,
26
+ .node ellipse,
27
+ .node polygon,
28
+ .node path {
29
+ fill: ${e.mainBkg};
30
+ stroke: ${e.nodeBorder};
31
+ stroke-width: 1px;
32
+ }
33
+ .flowchart-label text {
34
+ text-anchor: middle;
35
+ }
36
+ // .flowchart-label .text-outer-tspan {
37
+ // text-anchor: middle;
38
+ // }
39
+ // .flowchart-label .text-inner-tspan {
40
+ // text-anchor: start;
41
+ // }
42
+
43
+ .node .label {
44
+ text-align: center;
45
+ }
46
+ .node.clickable {
47
+ cursor: pointer;
48
+ }
49
+
50
+ .arrowheadPath {
51
+ fill: ${e.arrowheadColor};
52
+ }
53
+
54
+ .edgePath .path {
55
+ stroke: ${e.lineColor};
56
+ stroke-width: 2.0px;
57
+ }
58
+
59
+ .flowchart-link {
60
+ stroke: ${e.lineColor};
61
+ fill: none;
62
+ }
63
+
64
+ .edgeLabel {
65
+ background-color: ${e.edgeLabelBackground};
66
+ rect {
67
+ opacity: 0.5;
68
+ background-color: ${e.edgeLabelBackground};
69
+ fill: ${e.edgeLabelBackground};
70
+ }
71
+ text-align: center;
72
+ }
73
+
74
+ /* For html labels only */
75
+ .labelBkg {
76
+ background-color: ${q(e.edgeLabelBackground,.5)};
77
+ // background-color:
78
+ }
79
+
80
+ .node .cluster {
81
+ // fill: ${q(e.mainBkg,.5)};
82
+ fill: ${q(e.clusterBkg,.5)};
83
+ stroke: ${q(e.clusterBorder,.2)};
84
+ box-shadow: rgba(50, 50, 93, 0.25) 0px 13px 27px -5px, rgba(0, 0, 0, 0.3) 0px 8px 16px -8px;
85
+ stroke-width: 1px;
86
+ }
87
+
88
+ .cluster text {
89
+ fill: ${e.titleColor};
90
+ }
91
+
92
+ .cluster span,p {
93
+ color: ${e.titleColor};
94
+ }
95
+ /* .cluster div {
96
+ color: ${e.titleColor};
97
+ } */
98
+
99
+ div.mermaidTooltip {
100
+ position: absolute;
101
+ text-align: center;
102
+ max-width: 200px;
103
+ padding: 2px;
104
+ font-family: ${e.fontFamily};
105
+ font-size: 12px;
106
+ background: ${e.tertiaryColor};
107
+ border: 1px solid ${e.border2};
108
+ border-radius: 2px;
109
+ pointer-events: none;
110
+ z-index: 100;
111
+ }
112
+
113
+ .flowchartTitleText {
114
+ text-anchor: middle;
115
+ font-size: 18px;
116
+ fill: ${e.textColor};
117
+ }
118
+ `,lt=at;function be(e,a,d=!1){var c,n,g;const l=e;let f="default";(((c=l==null?void 0:l.classes)==null?void 0:c.length)||0)>0&&(f=((l==null?void 0:l.classes)||[]).join(" ")),f=f+" flowchart-label";let b=0,p="",x;switch(l.type){case"round":b=5,p="rect";break;case"composite":b=0,p="composite",x=0;break;case"square":p="rect";break;case"diamond":p="question";break;case"hexagon":p="hexagon";break;case"block_arrow":p="block_arrow";break;case"odd":p="rect_left_inv_arrow";break;case"lean_right":p="lean_right";break;case"lean_left":p="lean_left";break;case"trapezoid":p="trapezoid";break;case"inv_trapezoid":p="inv_trapezoid";break;case"rect_left_inv_arrow":p="rect_left_inv_arrow";break;case"circle":p="circle";break;case"ellipse":p="ellipse";break;case"stadium":p="stadium";break;case"subroutine":p="subroutine";break;case"cylinder":p="cylinder";break;case"group":p="rect";break;case"doublecircle":p="doublecircle";break;default:p="rect"}const y=ve((l==null?void 0:l.styles)||[]),T=l.label,v=l.size||{width:0,height:0,x:0,y:0};return{labelStyle:y.labelStyle,shape:p,labelText:T,rx:b,ry:b,class:f,style:y.style,id:l.id,directions:l.directions,width:v.width,height:v.height,x:v.x,y:v.y,positioned:d,intersect:void 0,type:l.type,padding:x??(((g=(n=se())==null?void 0:n.block)==null?void 0:g.padding)||0)}}async function ot(e,a,d){const c=be(a,d,!1);if(c.type==="group")return;const n=await ge(e,c),g=n.node().getBBox(),l=d.getBlock(c.id);l.size={width:g.width,height:g.height,x:0,y:0,node:n},d.setBlock(l),n.remove()}async function ct(e,a,d){const c=be(a,d,!0);d.getBlock(c.id).type!=="space"&&(await ge(e,c),a.intersect=c==null?void 0:c.intersect,ze(c))}async function ne(e,a,d,c){for(const n of a)await c(e,n,d),n.children&&await ne(e,n.children,d,c)}async function ut(e,a,d){await ne(e,a,d,ot)}async function dt(e,a,d){await ne(e,a,d,ct)}async function ht(e,a,d,c,n){const g=new Ce({multigraph:!0,compound:!0});g.setGraph({rankdir:"TB",nodesep:10,ranksep:10,marginx:8,marginy:8});for(const l of 
d)l.size&&g.setNode(l.id,{width:l.size.width,height:l.size.height,intersect:l.intersect});for(const l of a)if(l.start&&l.end){const f=c.getBlock(l.start),b=c.getBlock(l.end);if(f!=null&&f.size&&(b!=null&&b.size)){const p=f.size,x=b.size,y=[{x:p.x,y:p.y},{x:p.x+(x.x-p.x)/2,y:p.y+(x.y-p.y)/2},{x:x.x,y:x.y}];await Ie(e,{v:l.start,w:l.end,name:l.id},{...l,arrowTypeEnd:l.arrowTypeEnd,arrowTypeStart:l.arrowTypeStart,points:y,classes:"edge-thickness-normal edge-pattern-solid flowchart-link LS-a1 LE-b1"},void 0,"block",g,n),l.label&&(await Oe(e,{...l,label:l.label,labelStyle:"stroke: #333; stroke-width: 1.5px;fill:none;",arrowTypeEnd:l.arrowTypeEnd,arrowTypeStart:l.arrowTypeStart}),await Te({...l,x:y[1].x,y:y[1].y},{originalPath:y}))}}}const _=((oe=(le=he())==null?void 0:le.block)==null?void 0:oe.padding)||8;function gt(e,a){if(e===0||!Number.isInteger(e))throw new Error("Columns must be an integer !== 0.");if(a<0||!Number.isInteger(a))throw new Error("Position must be a non-negative integer."+a);if(e<0)return{px:a,py:0};if(e===1)return{px:0,py:a};const d=a%e,c=Math.floor(a/e);return{px:d,py:c}}const pt=e=>{let a=0,d=0;for(const c of e.children){const{width:n,height:g,x:l,y:f}=c.size||{width:0,height:0,x:0,y:0};S.debug("getMaxChildSize abc95 child:",c.id,"width:",n,"height:",g,"x:",l,"y:",f,c.type),c.type!=="space"&&(n>a&&(a=n/(e.widthInColumns||1)),g>d&&(d=g))}return{width:a,height:d}};function te(e,a,d=0,c=0){var n,g,l,f,b,p,x,y,T,v,N;S.debug("setBlockSizes abc95 (start)",e.id,(n=e==null?void 0:e.size)==null?void 0:n.x,"block width =",e==null?void 0:e.size,"sieblingWidth",d),(g=e==null?void 0:e.size)!=null&&g.width||(e.size={width:d,height:c,x:0,y:0});let E=0,L=0;if(((l=e.children)==null?void 0:l.length)>0){for(const h of e.children)te(h,a);const A=pt(e);E=A.width,L=A.height,S.debug("setBlockSizes abc95 maxWidth of",e.id,":s children is ",E,L);for(const h of e.children)h.size&&(S.debug(`abc95 Setting size of children of ${e.id} id=${h.id} ${E} ${L} 
${h.size}`),h.size.width=E*(h.widthInColumns||1)+_*((h.widthInColumns||1)-1),h.size.height=L,h.size.x=0,h.size.y=0,S.debug(`abc95 updating size of ${e.id} children child:${h.id} maxWidth:${E} maxHeight:${L}`));for(const h of e.children)te(h,a,E,L);const k=e.columns||-1;let D=0;for(const h of e.children)D+=h.widthInColumns||1;let o=e.children.length;k>0&&k<D&&(o=k),e.widthInColumns;const s=Math.ceil(D/o);let i=o*(E+_)+_,u=s*(L+_)+_;if(i<d){S.debug(`Detected to small siebling: abc95 ${e.id} sieblingWidth ${d} sieblingHeight ${c} width ${i}`),i=d,u=c;const h=(d-o*_-_)/o,t=(c-s*_-_)/s;S.debug("Size indata abc88",e.id,"childWidth",h,"maxWidth",E),S.debug("Size indata abc88",e.id,"childHeight",t,"maxHeight",L),S.debug("Size indata abc88 xSize",o,"padding",_);for(const m of e.children)m.size&&(m.size.width=h,m.size.height=t,m.size.x=0,m.size.y=0)}if(S.debug(`abc95 (finale calc) ${e.id} xSize ${o} ySize ${s} columns ${k}${e.children.length} width=${Math.max(i,((f=e.size)==null?void 0:f.width)||0)}`),i<(((b=e==null?void 0:e.size)==null?void 0:b.width)||0)){i=((p=e==null?void 0:e.size)==null?void 0:p.width)||0;const h=k>0?Math.min(e.children.length,k):e.children.length;if(h>0){const t=(i-h*_-_)/h;S.debug("abc95 (growing to fit) width",e.id,i,(x=e.size)==null?void 0:x.width,t);for(const m of e.children)m.size&&(m.size.width=t)}}e.size={width:i,height:u,x:0,y:0}}S.debug("setBlockSizes abc94 (done)",e.id,(y=e==null?void 0:e.size)==null?void 0:y.x,(T=e==null?void 0:e.size)==null?void 0:T.width,(v=e==null?void 0:e.size)==null?void 0:v.y,(N=e==null?void 0:e.size)==null?void 0:N.height)}function xe(e,a){var d,c,n,g,l,f,b,p,x,y,T,v,N,E,L,A,k;S.debug(`abc85 layout blocks (=>layoutBlocks) ${e.id} x: ${(d=e==null?void 0:e.size)==null?void 0:d.x} y: ${(c=e==null?void 0:e.size)==null?void 0:c.y} width: ${(n=e==null?void 0:e.size)==null?void 0:n.width}`);const D=e.columns||-1;if(S.debug("layoutBlocks columns abc95",e.id,"=>",D,e),e.children&&e.children.length>0){const 
o=((l=(g=e==null?void 0:e.children[0])==null?void 0:g.size)==null?void 0:l.width)||0,s=e.children.length*o+(e.children.length-1)*_;S.debug("widthOfChildren 88",s,"posX");let i=0;S.debug("abc91 block?.size?.x",e.id,(f=e==null?void 0:e.size)==null?void 0:f.x);let u=(b=e==null?void 0:e.size)!=null&&b.x?((p=e==null?void 0:e.size)==null?void 0:p.x)+(-((x=e==null?void 0:e.size)==null?void 0:x.width)/2||0):-_,h=0;for(const t of e.children){const m=e;if(!t.size)continue;const{width:r,height:R}=t.size,{px:Y,py:F}=gt(D,i);if(F!=h&&(h=F,u=(y=e==null?void 0:e.size)!=null&&y.x?((T=e==null?void 0:e.size)==null?void 0:T.x)+(-((v=e==null?void 0:e.size)==null?void 0:v.width)/2||0):-_,S.debug("New row in layout for block",e.id," and child ",t.id,h)),S.debug(`abc89 layout blocks (child) id: ${t.id} Pos: ${i} (px, py) ${Y},${F} (${(N=m==null?void 0:m.size)==null?void 0:N.x},${(E=m==null?void 0:m.size)==null?void 0:E.y}) parent: ${m.id} width: ${r}${_}`),m.size){const C=r/2;t.size.x=u+_+C,S.debug(`abc91 layout blocks (calc) px, pyid:${t.id} startingPos=X${u} new startingPosX${t.size.x} ${C} padding=${_} width=${r} halfWidth=${C} => x:${t.size.x} y:${t.size.y} ${t.widthInColumns} (width * (child?.w || 1)) / 2 ${r*((t==null?void 0:t.widthInColumns)||1)/2}`),u=t.size.x+C,t.size.y=m.size.y-m.size.height/2+F*(R+_)+R/2+_,S.debug(`abc88 layout blocks (calc) px, pyid:${t.id}startingPosX${u}${_}${C}=>x:${t.size.x}y:${t.size.y}${t.widthInColumns}(width * (child?.w || 1)) / 2${r*((t==null?void 0:t.widthInColumns)||1)/2}`)}t.children&&xe(t),i+=(t==null?void 0:t.widthInColumns)||1,S.debug("abc88 columnsPos",t,i)}}S.debug(`layout blocks (<==layoutBlocks) ${e.id} x: ${(L=e==null?void 0:e.size)==null?void 0:L.x} y: ${(A=e==null?void 0:e.size)==null?void 0:A.y} width: ${(k=e==null?void 0:e.size)==null?void 0:k.width}`)}function 
Se(e,{minX:a,minY:d,maxX:c,maxY:n}={minX:0,minY:0,maxX:0,maxY:0}){if(e.size&&e.id!=="root"){const{x:g,y:l,width:f,height:b}=e.size;g-f/2<a&&(a=g-f/2),l-b/2<d&&(d=l-b/2),g+f/2>c&&(c=g+f/2),l+b/2>n&&(n=l+b/2)}if(e.children)for(const g of e.children)({minX:a,minY:d,maxX:c,maxY:n}=Se(g,{minX:a,minY:d,maxX:c,maxY:n}));return{minX:a,minY:d,maxX:c,maxY:n}}function ft(e){const a=e.getBlock("root");if(!a)return;te(a,e,0,0),xe(a),S.debug("getBlocks",JSON.stringify(a,null,2));const{minX:d,minY:c,maxX:n,maxY:g}=Se(a),l=g-c,f=n-d;return{x:d,y:c,width:f,height:l}}const bt=function(e,a){return a.db.getClasses()},xt=async function(e,a,d,c){const{securityLevel:n,block:g}=se(),l=c.db;let f;n==="sandbox"&&(f=H("#i"+a));const b=n==="sandbox"?H(f.nodes()[0].contentDocument.body):H("body"),p=n==="sandbox"?b.select(`[id="${a}"]`):H(`[id="${a}"]`);ke(p,["point","circle","cross"],c.type,a);const y=l.getBlocks(),T=l.getBlocksFlat(),v=l.getEdges(),N=p.insert("g").attr("class","block");await ut(N,y,l);const E=ft(l);if(await dt(N,y,l),await ht(N,v,T,l,a),E){const L=E,A=Math.max(1,Math.round(.125*(L.width/L.height))),k=L.height+A+10,D=L.width+10,{useMaxWidth:o}=g;ye(p,k,D,!!o),S.debug("Here Bounds",E,L),p.attr("viewBox",`${L.x-5} ${L.y-5} ${L.width+10} ${L.height+10}`)}Ae(Be)},St={draw:xt,getClasses:bt},Tt={parser:Pe,db:nt,renderer:St,styles:lt};export{Tt as diagram};
frontend-dist/assets/c4Diagram-c83219d4-Dwk4T9_E.js ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import{s as we,g as Oe,a as Te,b as Re,c as Dt,d as Nt,l as le,e as De,f as Se,h as wt,i as ue,j as Pe,w as Me,k as Kt,m as oe}from"./index-BCNM9-Ly.js";import{d as Le,g as Ne}from"./svgDrawCommon-b86b1483-KNrWL8cU.js";var Yt=function(){var e=function(bt,_,x,m){for(x=x||{},m=bt.length;m--;x[bt[m]]=_);return x},t=[1,24],a=[1,25],o=[1,26],l=[1,27],i=[1,28],s=[1,63],r=[1,64],n=[1,65],h=[1,66],f=[1,67],d=[1,68],p=[1,69],E=[1,29],O=[1,30],R=[1,31],S=[1,32],L=[1,33],Y=[1,34],Q=[1,35],H=[1,36],q=[1,37],G=[1,38],K=[1,39],J=[1,40],Z=[1,41],$=[1,42],tt=[1,43],et=[1,44],it=[1,45],nt=[1,46],st=[1,47],at=[1,48],rt=[1,50],lt=[1,51],ot=[1,52],ct=[1,53],ht=[1,54],ut=[1,55],dt=[1,56],ft=[1,57],pt=[1,58],yt=[1,59],gt=[1,60],At=[14,42],Vt=[14,34,36,37,38,39,40,41,42,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74],Ot=[12,14,34,36,37,38,39,40,41,42,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74],v=[1,82],k=[1,83],A=[1,84],C=[1,85],w=[12,14,42],ne=[12,14,33,42],Pt=[12,14,33,42,76,77,79,80],mt=[12,33],zt=[34,36,37,38,39,40,41,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74],Xt={trace:function(){},yy:{},symbols_:{error:2,start:3,mermaidDoc:4,direction:5,direction_tb:6,direction_bt:7,direction_rl:8,direction_lr:9,graphConfig:10,C4_CONTEXT:11,NEWLINE:12,statements:13,EOF:14,C4_CONTAINER:15,C4_COMPONENT:16,C4_DYNAMIC:17,C4_DEPLOYMENT:18,otherStatements:19,diagramStatements:20,otherStatement:21,title:22,accDescription:23,acc_title:24,acc_title_value:25,acc_descr:26,acc_descr_value:27,acc_descr_multiline_value:28,boundaryStatement:29,boundaryStartStatement:30,boundaryStopStatement:31,boundaryStart:32,LBRACE:33,ENTERPRISE_BOUNDARY:34,attributes:35,SYSTEM_BOUNDARY:36,BOUNDARY:37,CONTAINER_BOUNDARY:38,NODE:39,NODE_L:40,NODE_R:41,RBRACE:42,diagramStatement:43,PERSON:44,PERSON_EXT:45,SYSTEM:46,SYSTEM_DB:47,SYSTEM_QUEUE:48,SYSTEM_EXT:49,SYSTEM_EXT_DB:50,SYSTEM_EXT
_QUEUE:51,CONTAINER:52,CONTAINER_DB:53,CONTAINER_QUEUE:54,CONTAINER_EXT:55,CONTAINER_EXT_DB:56,CONTAINER_EXT_QUEUE:57,COMPONENT:58,COMPONENT_DB:59,COMPONENT_QUEUE:60,COMPONENT_EXT:61,COMPONENT_EXT_DB:62,COMPONENT_EXT_QUEUE:63,REL:64,BIREL:65,REL_U:66,REL_D:67,REL_L:68,REL_R:69,REL_B:70,REL_INDEX:71,UPDATE_EL_STYLE:72,UPDATE_REL_STYLE:73,UPDATE_LAYOUT_CONFIG:74,attribute:75,STR:76,STR_KEY:77,STR_VALUE:78,ATTRIBUTE:79,ATTRIBUTE_EMPTY:80,$accept:0,$end:1},terminals_:{2:"error",6:"direction_tb",7:"direction_bt",8:"direction_rl",9:"direction_lr",11:"C4_CONTEXT",12:"NEWLINE",14:"EOF",15:"C4_CONTAINER",16:"C4_COMPONENT",17:"C4_DYNAMIC",18:"C4_DEPLOYMENT",22:"title",23:"accDescription",24:"acc_title",25:"acc_title_value",26:"acc_descr",27:"acc_descr_value",28:"acc_descr_multiline_value",33:"LBRACE",34:"ENTERPRISE_BOUNDARY",36:"SYSTEM_BOUNDARY",37:"BOUNDARY",38:"CONTAINER_BOUNDARY",39:"NODE",40:"NODE_L",41:"NODE_R",42:"RBRACE",44:"PERSON",45:"PERSON_EXT",46:"SYSTEM",47:"SYSTEM_DB",48:"SYSTEM_QUEUE",49:"SYSTEM_EXT",50:"SYSTEM_EXT_DB",51:"SYSTEM_EXT_QUEUE",52:"CONTAINER",53:"CONTAINER_DB",54:"CONTAINER_QUEUE",55:"CONTAINER_EXT",56:"CONTAINER_EXT_DB",57:"CONTAINER_EXT_QUEUE",58:"COMPONENT",59:"COMPONENT_DB",60:"COMPONENT_QUEUE",61:"COMPONENT_EXT",62:"COMPONENT_EXT_DB",63:"COMPONENT_EXT_QUEUE",64:"REL",65:"BIREL",66:"REL_U",67:"REL_D",68:"REL_L",69:"REL_R",70:"REL_B",71:"REL_INDEX",72:"UPDATE_EL_STYLE",73:"UPDATE_REL_STYLE",74:"UPDATE_LAYOUT_CONFIG",76:"STR",77:"STR_KEY",78:"STR_VALUE",79:"ATTRIBUTE",80:"ATTRIBUTE_EMPTY"},productions_:[0,[3,1],[3,1],[5,1],[5,1],[5,1],[5,1],[4,1],[10,4],[10,4],[10,4],[10,4],[10,4],[13,1],[13,1],[13,2],[19,1],[19,2],[19,3],[21,1],[21,1],[21,2],[21,2],[21,1],[29,3],[30,3],[30,3],[30,4],[32,2],[32,2],[32,2],[32,2],[32,2],[32,2],[32,2],[31,1],[20,1],[20,2],[20,3],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[43,1],[43,2],[43,2],[43,2],[43,2],[43,2],[43,2],[
43,2],[43,2],[43,2],[43,2],[43,2],[35,1],[35,2],[75,1],[75,2],[75,1],[75,1]],performAction:function(_,x,m,g,T,u,Tt){var y=u.length-1;switch(T){case 3:g.setDirection("TB");break;case 4:g.setDirection("BT");break;case 5:g.setDirection("RL");break;case 6:g.setDirection("LR");break;case 8:case 9:case 10:case 11:case 12:g.setC4Type(u[y-3]);break;case 19:g.setTitle(u[y].substring(6)),this.$=u[y].substring(6);break;case 20:g.setAccDescription(u[y].substring(15)),this.$=u[y].substring(15);break;case 21:this.$=u[y].trim(),g.setTitle(this.$);break;case 22:case 23:this.$=u[y].trim(),g.setAccDescription(this.$);break;case 28:case 29:u[y].splice(2,0,"ENTERPRISE"),g.addPersonOrSystemBoundary(...u[y]),this.$=u[y];break;case 30:g.addPersonOrSystemBoundary(...u[y]),this.$=u[y];break;case 31:u[y].splice(2,0,"CONTAINER"),g.addContainerBoundary(...u[y]),this.$=u[y];break;case 32:g.addDeploymentNode("node",...u[y]),this.$=u[y];break;case 33:g.addDeploymentNode("nodeL",...u[y]),this.$=u[y];break;case 34:g.addDeploymentNode("nodeR",...u[y]),this.$=u[y];break;case 35:g.popBoundaryParseStack();break;case 39:g.addPersonOrSystem("person",...u[y]),this.$=u[y];break;case 40:g.addPersonOrSystem("external_person",...u[y]),this.$=u[y];break;case 41:g.addPersonOrSystem("system",...u[y]),this.$=u[y];break;case 42:g.addPersonOrSystem("system_db",...u[y]),this.$=u[y];break;case 43:g.addPersonOrSystem("system_queue",...u[y]),this.$=u[y];break;case 44:g.addPersonOrSystem("external_system",...u[y]),this.$=u[y];break;case 45:g.addPersonOrSystem("external_system_db",...u[y]),this.$=u[y];break;case 46:g.addPersonOrSystem("external_system_queue",...u[y]),this.$=u[y];break;case 47:g.addContainer("container",...u[y]),this.$=u[y];break;case 48:g.addContainer("container_db",...u[y]),this.$=u[y];break;case 49:g.addContainer("container_queue",...u[y]),this.$=u[y];break;case 50:g.addContainer("external_container",...u[y]),this.$=u[y];break;case 
51:g.addContainer("external_container_db",...u[y]),this.$=u[y];break;case 52:g.addContainer("external_container_queue",...u[y]),this.$=u[y];break;case 53:g.addComponent("component",...u[y]),this.$=u[y];break;case 54:g.addComponent("component_db",...u[y]),this.$=u[y];break;case 55:g.addComponent("component_queue",...u[y]),this.$=u[y];break;case 56:g.addComponent("external_component",...u[y]),this.$=u[y];break;case 57:g.addComponent("external_component_db",...u[y]),this.$=u[y];break;case 58:g.addComponent("external_component_queue",...u[y]),this.$=u[y];break;case 60:g.addRel("rel",...u[y]),this.$=u[y];break;case 61:g.addRel("birel",...u[y]),this.$=u[y];break;case 62:g.addRel("rel_u",...u[y]),this.$=u[y];break;case 63:g.addRel("rel_d",...u[y]),this.$=u[y];break;case 64:g.addRel("rel_l",...u[y]),this.$=u[y];break;case 65:g.addRel("rel_r",...u[y]),this.$=u[y];break;case 66:g.addRel("rel_b",...u[y]),this.$=u[y];break;case 67:u[y].splice(0,1),g.addRel("rel",...u[y]),this.$=u[y];break;case 68:g.updateElStyle("update_el_style",...u[y]),this.$=u[y];break;case 69:g.updateRelStyle("update_rel_style",...u[y]),this.$=u[y];break;case 70:g.updateLayoutConfig("update_layout_config",...u[y]),this.$=u[y];break;case 71:this.$=[u[y]];break;case 72:u[y].unshift(u[y-1]),this.$=u[y];break;case 73:case 75:this.$=u[y].trim();break;case 74:let Et={};Et[u[y-1].trim()]=u[y].trim(),this.$=Et;break;case 
76:this.$="";break}},table:[{3:1,4:2,5:3,6:[1,5],7:[1,6],8:[1,7],9:[1,8],10:4,11:[1,9],15:[1,10],16:[1,11],17:[1,12],18:[1,13]},{1:[3]},{1:[2,1]},{1:[2,2]},{1:[2,7]},{1:[2,3]},{1:[2,4]},{1:[2,5]},{1:[2,6]},{12:[1,14]},{12:[1,15]},{12:[1,16]},{12:[1,17]},{12:[1,18]},{13:19,19:20,20:21,21:22,22:t,23:a,24:o,26:l,28:i,29:49,30:61,32:62,34:s,36:r,37:n,38:h,39:f,40:d,41:p,43:23,44:E,45:O,46:R,47:S,48:L,49:Y,50:Q,51:H,52:q,53:G,54:K,55:J,56:Z,57:$,58:tt,59:et,60:it,61:nt,62:st,63:at,64:rt,65:lt,66:ot,67:ct,68:ht,69:ut,70:dt,71:ft,72:pt,73:yt,74:gt},{13:70,19:20,20:21,21:22,22:t,23:a,24:o,26:l,28:i,29:49,30:61,32:62,34:s,36:r,37:n,38:h,39:f,40:d,41:p,43:23,44:E,45:O,46:R,47:S,48:L,49:Y,50:Q,51:H,52:q,53:G,54:K,55:J,56:Z,57:$,58:tt,59:et,60:it,61:nt,62:st,63:at,64:rt,65:lt,66:ot,67:ct,68:ht,69:ut,70:dt,71:ft,72:pt,73:yt,74:gt},{13:71,19:20,20:21,21:22,22:t,23:a,24:o,26:l,28:i,29:49,30:61,32:62,34:s,36:r,37:n,38:h,39:f,40:d,41:p,43:23,44:E,45:O,46:R,47:S,48:L,49:Y,50:Q,51:H,52:q,53:G,54:K,55:J,56:Z,57:$,58:tt,59:et,60:it,61:nt,62:st,63:at,64:rt,65:lt,66:ot,67:ct,68:ht,69:ut,70:dt,71:ft,72:pt,73:yt,74:gt},{13:72,19:20,20:21,21:22,22:t,23:a,24:o,26:l,28:i,29:49,30:61,32:62,34:s,36:r,37:n,38:h,39:f,40:d,41:p,43:23,44:E,45:O,46:R,47:S,48:L,49:Y,50:Q,51:H,52:q,53:G,54:K,55:J,56:Z,57:$,58:tt,59:et,60:it,61:nt,62:st,63:at,64:rt,65:lt,66:ot,67:ct,68:ht,69:ut,70:dt,71:ft,72:pt,73:yt,74:gt},{13:73,19:20,20:21,21:22,22:t,23:a,24:o,26:l,28:i,29:49,30:61,32:62,34:s,36:r,37:n,38:h,39:f,40:d,41:p,43:23,44:E,45:O,46:R,47:S,48:L,49:Y,50:Q,51:H,52:q,53:G,54:K,55:J,56:Z,57:$,58:tt,59:et,60:it,61:nt,62:st,63:at,64:rt,65:lt,66:ot,67:ct,68:ht,69:ut,70:dt,71:ft,72:pt,73:yt,74:gt},{14:[1,74]},e(At,[2,13],{43:23,29:49,30:61,32:62,20:75,34:s,36:r,37:n,38:h,39:f,40:d,41:p,44:E,45:O,46:R,47:S,48:L,49:Y,50:Q,51:H,52:q,53:G,54:K,55:J,56:Z,57:$,58:tt,59:et,60:it,61:nt,62:st,63:at,64:rt,65:lt,66:ot,67:ct,68:ht,69:ut,70:dt,71:ft,72:pt,73:yt,74:gt}),e(At,[2,14]),e(Vt,[2,16],{12:[1,76]}),e(At,[2,36],{12:[1,77]
}),e(Ot,[2,19]),e(Ot,[2,20]),{25:[1,78]},{27:[1,79]},e(Ot,[2,23]),{35:80,75:81,76:v,77:k,79:A,80:C},{35:86,75:81,76:v,77:k,79:A,80:C},{35:87,75:81,76:v,77:k,79:A,80:C},{35:88,75:81,76:v,77:k,79:A,80:C},{35:89,75:81,76:v,77:k,79:A,80:C},{35:90,75:81,76:v,77:k,79:A,80:C},{35:91,75:81,76:v,77:k,79:A,80:C},{35:92,75:81,76:v,77:k,79:A,80:C},{35:93,75:81,76:v,77:k,79:A,80:C},{35:94,75:81,76:v,77:k,79:A,80:C},{35:95,75:81,76:v,77:k,79:A,80:C},{35:96,75:81,76:v,77:k,79:A,80:C},{35:97,75:81,76:v,77:k,79:A,80:C},{35:98,75:81,76:v,77:k,79:A,80:C},{35:99,75:81,76:v,77:k,79:A,80:C},{35:100,75:81,76:v,77:k,79:A,80:C},{35:101,75:81,76:v,77:k,79:A,80:C},{35:102,75:81,76:v,77:k,79:A,80:C},{35:103,75:81,76:v,77:k,79:A,80:C},{35:104,75:81,76:v,77:k,79:A,80:C},e(w,[2,59]),{35:105,75:81,76:v,77:k,79:A,80:C},{35:106,75:81,76:v,77:k,79:A,80:C},{35:107,75:81,76:v,77:k,79:A,80:C},{35:108,75:81,76:v,77:k,79:A,80:C},{35:109,75:81,76:v,77:k,79:A,80:C},{35:110,75:81,76:v,77:k,79:A,80:C},{35:111,75:81,76:v,77:k,79:A,80:C},{35:112,75:81,76:v,77:k,79:A,80:C},{35:113,75:81,76:v,77:k,79:A,80:C},{35:114,75:81,76:v,77:k,79:A,80:C},{35:115,75:81,76:v,77:k,79:A,80:C},{20:116,29:49,30:61,32:62,34:s,36:r,37:n,38:h,39:f,40:d,41:p,43:23,44:E,45:O,46:R,47:S,48:L,49:Y,50:Q,51:H,52:q,53:G,54:K,55:J,56:Z,57:$,58:tt,59:et,60:it,61:nt,62:st,63:at,64:rt,65:lt,66:ot,67:ct,68:ht,69:ut,70:dt,71:ft,72:pt,73:yt,74:gt},{12:[1,118],33:[1,117]},{35:119,75:81,76:v,77:k,79:A,80:C},{35:120,75:81,76:v,77:k,79:A,80:C},{35:121,75:81,76:v,77:k,79:A,80:C},{35:122,75:81,76:v,77:k,79:A,80:C},{35:123,75:81,76:v,77:k,79:A,80:C},{35:124,75:81,76:v,77:k,79:A,80:C},{35:125,75:81,76:v,77:k,79:A,80:C},{14:[1,126]},{14:[1,127]},{14:[1,128]},{14:[1,129]},{1:[2,8]},e(At,[2,15]),e(Vt,[2,17],{21:22,19:130,22:t,23:a,24:o,26:l,28:i}),e(At,[2,37],{19:20,20:21,21:22,43:23,29:49,30:61,32:62,13:131,22:t,23:a,24:o,26:l,28:i,34:s,36:r,37:n,38:h,39:f,40:d,41:p,44:E,45:O,46:R,47:S,48:L,49:Y,50:Q,51:H,52:q,53:G,54:K,55:J,56:Z,57:$,58:tt,59:et,60:it,61:nt
,62:st,63:at,64:rt,65:lt,66:ot,67:ct,68:ht,69:ut,70:dt,71:ft,72:pt,73:yt,74:gt}),e(Ot,[2,21]),e(Ot,[2,22]),e(w,[2,39]),e(ne,[2,71],{75:81,35:132,76:v,77:k,79:A,80:C}),e(Pt,[2,73]),{78:[1,133]},e(Pt,[2,75]),e(Pt,[2,76]),e(w,[2,40]),e(w,[2,41]),e(w,[2,42]),e(w,[2,43]),e(w,[2,44]),e(w,[2,45]),e(w,[2,46]),e(w,[2,47]),e(w,[2,48]),e(w,[2,49]),e(w,[2,50]),e(w,[2,51]),e(w,[2,52]),e(w,[2,53]),e(w,[2,54]),e(w,[2,55]),e(w,[2,56]),e(w,[2,57]),e(w,[2,58]),e(w,[2,60]),e(w,[2,61]),e(w,[2,62]),e(w,[2,63]),e(w,[2,64]),e(w,[2,65]),e(w,[2,66]),e(w,[2,67]),e(w,[2,68]),e(w,[2,69]),e(w,[2,70]),{31:134,42:[1,135]},{12:[1,136]},{33:[1,137]},e(mt,[2,28]),e(mt,[2,29]),e(mt,[2,30]),e(mt,[2,31]),e(mt,[2,32]),e(mt,[2,33]),e(mt,[2,34]),{1:[2,9]},{1:[2,10]},{1:[2,11]},{1:[2,12]},e(Vt,[2,18]),e(At,[2,38]),e(ne,[2,72]),e(Pt,[2,74]),e(w,[2,24]),e(w,[2,35]),e(zt,[2,25]),e(zt,[2,26],{12:[1,138]}),e(zt,[2,27])],defaultActions:{2:[2,1],3:[2,2],4:[2,7],5:[2,3],6:[2,4],7:[2,5],8:[2,6],74:[2,8],126:[2,9],127:[2,10],128:[2,11],129:[2,12]},parseError:function(_,x){if(x.recoverable)this.trace(_);else{var m=new Error(_);throw m.hash=x,m}},parse:function(_){var x=this,m=[0],g=[],T=[null],u=[],Tt=this.table,y="",Et=0,se=0,ve=2,ae=1,ke=u.slice.call(arguments,1),D=Object.create(this.lexer),vt={yy:{}};for(var Qt in this.yy)Object.prototype.hasOwnProperty.call(this.yy,Qt)&&(vt.yy[Qt]=this.yy[Qt]);D.setInput(_,vt.yy),vt.yy.lexer=D,vt.yy.parser=this,typeof D.yylloc>"u"&&(D.yylloc={});var Ht=D.yylloc;u.push(Ht);var Ae=D.options&&D.options.ranges;typeof vt.yy.parseError=="function"?this.parseError=vt.yy.parseError:this.parseError=Object.getPrototypeOf(this).parseError;function Ce(){var X;return X=g.pop()||D.lex()||ae,typeof X!="number"&&(X instanceof Array&&(g=X,X=g.pop()),X=x.symbols_[X]||X),X}for(var M,kt,N,qt,Ct={},Mt,z,re,Lt;;){if(kt=m[m.length-1],this.defaultActions[kt]?N=this.defaultActions[kt]:((M===null||typeof M>"u")&&(M=Ce()),N=Tt[kt]&&Tt[kt][M]),typeof N>"u"||!N.length||!N[0]){var Gt="";Lt=[];for(Mt in 
Tt[kt])this.terminals_[Mt]&&Mt>ve&&Lt.push("'"+this.terminals_[Mt]+"'");D.showPosition?Gt="Parse error on line "+(Et+1)+`:
2
+ `+D.showPosition()+`
3
+ Expecting `+Lt.join(", ")+", got '"+(this.terminals_[M]||M)+"'":Gt="Parse error on line "+(Et+1)+": Unexpected "+(M==ae?"end of input":"'"+(this.terminals_[M]||M)+"'"),this.parseError(Gt,{text:D.match,token:this.terminals_[M]||M,line:D.yylineno,loc:Ht,expected:Lt})}if(N[0]instanceof Array&&N.length>1)throw new Error("Parse Error: multiple actions possible at state: "+kt+", token: "+M);switch(N[0]){case 1:m.push(M),T.push(D.yytext),u.push(D.yylloc),m.push(N[1]),M=null,se=D.yyleng,y=D.yytext,Et=D.yylineno,Ht=D.yylloc;break;case 2:if(z=this.productions_[N[1]][1],Ct.$=T[T.length-z],Ct._$={first_line:u[u.length-(z||1)].first_line,last_line:u[u.length-1].last_line,first_column:u[u.length-(z||1)].first_column,last_column:u[u.length-1].last_column},Ae&&(Ct._$.range=[u[u.length-(z||1)].range[0],u[u.length-1].range[1]]),qt=this.performAction.apply(Ct,[y,se,Et,vt.yy,N[1],T,u].concat(ke)),typeof qt<"u")return qt;z&&(m=m.slice(0,-1*z*2),T=T.slice(0,-1*z),u=u.slice(0,-1*z)),m.push(this.productions_[N[1]][0]),T.push(Ct.$),u.push(Ct._$),re=Tt[m[m.length-2]][m[m.length-1]],m.push(re);break;case 3:return!0}}return!0}},Ee=function(){var bt={EOF:1,parseError:function(x,m){if(this.yy.parser)this.yy.parser.parseError(x,m);else throw new Error(x)},setInput:function(_,x){return this.yy=x||this.yy||{},this._input=_,this._more=this._backtrack=this.done=!1,this.yylineno=this.yyleng=0,this.yytext=this.matched=this.match="",this.conditionStack=["INITIAL"],this.yylloc={first_line:1,first_column:0,last_line:1,last_column:0},this.options.ranges&&(this.yylloc.range=[0,0]),this.offset=0,this},input:function(){var _=this._input[0];this.yytext+=_,this.yyleng++,this.offset++,this.match+=_,this.matched+=_;var x=_.match(/(?:\r\n?|\n).*/g);return x?(this.yylineno++,this.yylloc.last_line++):this.yylloc.last_column++,this.options.ranges&&this.yylloc.range[1]++,this._input=this._input.slice(1),_},unput:function(_){var 
x=_.length,m=_.split(/(?:\r\n?|\n)/g);this._input=_+this._input,this.yytext=this.yytext.substr(0,this.yytext.length-x),this.offset-=x;var g=this.match.split(/(?:\r\n?|\n)/g);this.match=this.match.substr(0,this.match.length-1),this.matched=this.matched.substr(0,this.matched.length-1),m.length-1&&(this.yylineno-=m.length-1);var T=this.yylloc.range;return this.yylloc={first_line:this.yylloc.first_line,last_line:this.yylineno+1,first_column:this.yylloc.first_column,last_column:m?(m.length===g.length?this.yylloc.first_column:0)+g[g.length-m.length].length-m[0].length:this.yylloc.first_column-x},this.options.ranges&&(this.yylloc.range=[T[0],T[0]+this.yyleng-x]),this.yyleng=this.yytext.length,this},more:function(){return this._more=!0,this},reject:function(){if(this.options.backtrack_lexer)this._backtrack=!0;else return this.parseError("Lexical error on line "+(this.yylineno+1)+`. You can only invoke reject() in the lexer when the lexer is of the backtracking persuasion (options.backtrack_lexer = true).
4
+ `+this.showPosition(),{text:"",token:null,line:this.yylineno});return this},less:function(_){this.unput(this.match.slice(_))},pastInput:function(){var _=this.matched.substr(0,this.matched.length-this.match.length);return(_.length>20?"...":"")+_.substr(-20).replace(/\n/g,"")},upcomingInput:function(){var _=this.match;return _.length<20&&(_+=this._input.substr(0,20-_.length)),(_.substr(0,20)+(_.length>20?"...":"")).replace(/\n/g,"")},showPosition:function(){var _=this.pastInput(),x=new Array(_.length+1).join("-");return _+this.upcomingInput()+`
5
+ `+x+"^"},test_match:function(_,x){var m,g,T;if(this.options.backtrack_lexer&&(T={yylineno:this.yylineno,yylloc:{first_line:this.yylloc.first_line,last_line:this.last_line,first_column:this.yylloc.first_column,last_column:this.yylloc.last_column},yytext:this.yytext,match:this.match,matches:this.matches,matched:this.matched,yyleng:this.yyleng,offset:this.offset,_more:this._more,_input:this._input,yy:this.yy,conditionStack:this.conditionStack.slice(0),done:this.done},this.options.ranges&&(T.yylloc.range=this.yylloc.range.slice(0))),g=_[0].match(/(?:\r\n?|\n).*/g),g&&(this.yylineno+=g.length),this.yylloc={first_line:this.yylloc.last_line,last_line:this.yylineno+1,first_column:this.yylloc.last_column,last_column:g?g[g.length-1].length-g[g.length-1].match(/\r?\n?/)[0].length:this.yylloc.last_column+_[0].length},this.yytext+=_[0],this.match+=_[0],this.matches=_,this.yyleng=this.yytext.length,this.options.ranges&&(this.yylloc.range=[this.offset,this.offset+=this.yyleng]),this._more=!1,this._backtrack=!1,this._input=this._input.slice(_[0].length),this.matched+=_[0],m=this.performAction.call(this,this.yy,this,x,this.conditionStack[this.conditionStack.length-1]),this.done&&this._input&&(this.done=!1),m)return m;if(this._backtrack){for(var u in T)this[u]=T[u];return!1}return!1},next:function(){if(this.done)return this.EOF;this._input||(this.done=!0);var _,x,m,g;this._more||(this.yytext="",this.match="");for(var T=this._currentRules(),u=0;u<T.length;u++)if(m=this._input.match(this.rules[T[u]]),m&&(!x||m[0].length>x[0].length)){if(x=m,g=u,this.options.backtrack_lexer){if(_=this.test_match(m,T[u]),_!==!1)return _;if(this._backtrack){x=!1;continue}else return!1}else if(!this.options.flex)break}return x?(_=this.test_match(x,T[g]),_!==!1?_:!1):this._input===""?this.EOF:this.parseError("Lexical error on line "+(this.yylineno+1)+`. Unrecognized text.
6
+ `+this.showPosition(),{text:"",token:null,line:this.yylineno})},lex:function(){var x=this.next();return x||this.lex()},begin:function(x){this.conditionStack.push(x)},popState:function(){var x=this.conditionStack.length-1;return x>0?this.conditionStack.pop():this.conditionStack[0]},_currentRules:function(){return this.conditionStack.length&&this.conditionStack[this.conditionStack.length-1]?this.conditions[this.conditionStack[this.conditionStack.length-1]].rules:this.conditions.INITIAL.rules},topState:function(x){return x=this.conditionStack.length-1-Math.abs(x||0),x>=0?this.conditionStack[x]:"INITIAL"},pushState:function(x){this.begin(x)},stateStackSize:function(){return this.conditionStack.length},options:{},performAction:function(x,m,g,T){switch(g){case 0:return 6;case 1:return 7;case 2:return 8;case 3:return 9;case 4:return 22;case 5:return 23;case 6:return this.begin("acc_title"),24;case 7:return this.popState(),"acc_title_value";case 8:return this.begin("acc_descr"),26;case 9:return this.popState(),"acc_descr_value";case 10:this.begin("acc_descr_multiline");break;case 11:this.popState();break;case 12:return"acc_descr_multiline_value";case 13:break;case 14:c;break;case 15:return 12;case 16:break;case 17:return 11;case 18:return 15;case 19:return 16;case 20:return 17;case 21:return 18;case 22:return this.begin("person_ext"),45;case 23:return this.begin("person"),44;case 24:return this.begin("system_ext_queue"),51;case 25:return this.begin("system_ext_db"),50;case 26:return this.begin("system_ext"),49;case 27:return this.begin("system_queue"),48;case 28:return this.begin("system_db"),47;case 29:return this.begin("system"),46;case 30:return this.begin("boundary"),37;case 31:return this.begin("enterprise_boundary"),34;case 32:return this.begin("system_boundary"),36;case 33:return this.begin("container_ext_queue"),57;case 34:return this.begin("container_ext_db"),56;case 35:return this.begin("container_ext"),55;case 36:return this.begin("container_queue"),54;case 
37:return this.begin("container_db"),53;case 38:return this.begin("container"),52;case 39:return this.begin("container_boundary"),38;case 40:return this.begin("component_ext_queue"),63;case 41:return this.begin("component_ext_db"),62;case 42:return this.begin("component_ext"),61;case 43:return this.begin("component_queue"),60;case 44:return this.begin("component_db"),59;case 45:return this.begin("component"),58;case 46:return this.begin("node"),39;case 47:return this.begin("node"),39;case 48:return this.begin("node_l"),40;case 49:return this.begin("node_r"),41;case 50:return this.begin("rel"),64;case 51:return this.begin("birel"),65;case 52:return this.begin("rel_u"),66;case 53:return this.begin("rel_u"),66;case 54:return this.begin("rel_d"),67;case 55:return this.begin("rel_d"),67;case 56:return this.begin("rel_l"),68;case 57:return this.begin("rel_l"),68;case 58:return this.begin("rel_r"),69;case 59:return this.begin("rel_r"),69;case 60:return this.begin("rel_b"),70;case 61:return this.begin("rel_index"),71;case 62:return this.begin("update_el_style"),72;case 63:return this.begin("update_rel_style"),73;case 64:return this.begin("update_layout_config"),74;case 65:return"EOF_IN_STRUCT";case 66:return this.begin("attribute"),"ATTRIBUTE_EMPTY";case 67:this.begin("attribute");break;case 68:this.popState(),this.popState();break;case 69:return 80;case 70:break;case 71:return 80;case 72:this.begin("string");break;case 73:this.popState();break;case 74:return"STR";case 75:this.begin("string_kv");break;case 76:return this.begin("string_kv_key"),"STR_KEY";case 77:this.popState(),this.begin("string_kv_value");break;case 78:return"STR_VALUE";case 79:this.popState(),this.popState();break;case 80:return"STR";case 81:return"LBRACE";case 82:return"RBRACE";case 83:return"SPACE";case 84:return"EOL";case 85:return 
14}},rules:[/^(?:.*direction\s+TB[^\n]*)/,/^(?:.*direction\s+BT[^\n]*)/,/^(?:.*direction\s+RL[^\n]*)/,/^(?:.*direction\s+LR[^\n]*)/,/^(?:title\s[^#\n;]+)/,/^(?:accDescription\s[^#\n;]+)/,/^(?:accTitle\s*:\s*)/,/^(?:(?!\n||)*[^\n]*)/,/^(?:accDescr\s*:\s*)/,/^(?:(?!\n||)*[^\n]*)/,/^(?:accDescr\s*\{\s*)/,/^(?:[\}])/,/^(?:[^\}]*)/,/^(?:%%(?!\{)*[^\n]*(\r?\n?)+)/,/^(?:%%[^\n]*(\r?\n)*)/,/^(?:\s*(\r?\n)+)/,/^(?:\s+)/,/^(?:C4Context\b)/,/^(?:C4Container\b)/,/^(?:C4Component\b)/,/^(?:C4Dynamic\b)/,/^(?:C4Deployment\b)/,/^(?:Person_Ext\b)/,/^(?:Person\b)/,/^(?:SystemQueue_Ext\b)/,/^(?:SystemDb_Ext\b)/,/^(?:System_Ext\b)/,/^(?:SystemQueue\b)/,/^(?:SystemDb\b)/,/^(?:System\b)/,/^(?:Boundary\b)/,/^(?:Enterprise_Boundary\b)/,/^(?:System_Boundary\b)/,/^(?:ContainerQueue_Ext\b)/,/^(?:ContainerDb_Ext\b)/,/^(?:Container_Ext\b)/,/^(?:ContainerQueue\b)/,/^(?:ContainerDb\b)/,/^(?:Container\b)/,/^(?:Container_Boundary\b)/,/^(?:ComponentQueue_Ext\b)/,/^(?:ComponentDb_Ext\b)/,/^(?:Component_Ext\b)/,/^(?:ComponentQueue\b)/,/^(?:ComponentDb\b)/,/^(?:Component\b)/,/^(?:Deployment_Node\b)/,/^(?:Node\b)/,/^(?:Node_L\b)/,/^(?:Node_R\b)/,/^(?:Rel\b)/,/^(?:BiRel\b)/,/^(?:Rel_Up\b)/,/^(?:Rel_U\b)/,/^(?:Rel_Down\b)/,/^(?:Rel_D\b)/,/^(?:Rel_Left\b)/,/^(?:Rel_L\b)/,/^(?:Rel_Right\b)/,/^(?:Rel_R\b)/,/^(?:Rel_Back\b)/,/^(?:RelIndex\b)/,/^(?:UpdateElementStyle\b)/,/^(?:UpdateRelStyle\b)/,/^(?:UpdateLayoutConfig\b)/,/^(?:$)/,/^(?:[(][ ]*[,])/,/^(?:[(])/,/^(?:[)])/,/^(?:,,)/,/^(?:,)/,/^(?:[ ]*["]["])/,/^(?:[ ]*["])/,/^(?:["])/,/^(?:[^"]*)/,/^(?:[ ]*[\$])/,/^(?:[^=]*)/,/^(?:[=][ 
]*["])/,/^(?:[^"]+)/,/^(?:["])/,/^(?:[^,]+)/,/^(?:\{)/,/^(?:\})/,/^(?:[\s]+)/,/^(?:[\n\r]+)/,/^(?:$)/],conditions:{acc_descr_multiline:{rules:[11,12],inclusive:!1},acc_descr:{rules:[9],inclusive:!1},acc_title:{rules:[7],inclusive:!1},string_kv_value:{rules:[78,79],inclusive:!1},string_kv_key:{rules:[77],inclusive:!1},string_kv:{rules:[76],inclusive:!1},string:{rules:[73,74],inclusive:!1},attribute:{rules:[68,69,70,71,72,75,80],inclusive:!1},update_layout_config:{rules:[65,66,67,68],inclusive:!1},update_rel_style:{rules:[65,66,67,68],inclusive:!1},update_el_style:{rules:[65,66,67,68],inclusive:!1},rel_b:{rules:[65,66,67,68],inclusive:!1},rel_r:{rules:[65,66,67,68],inclusive:!1},rel_l:{rules:[65,66,67,68],inclusive:!1},rel_d:{rules:[65,66,67,68],inclusive:!1},rel_u:{rules:[65,66,67,68],inclusive:!1},rel_bi:{rules:[],inclusive:!1},rel:{rules:[65,66,67,68],inclusive:!1},node_r:{rules:[65,66,67,68],inclusive:!1},node_l:{rules:[65,66,67,68],inclusive:!1},node:{rules:[65,66,67,68],inclusive:!1},index:{rules:[],inclusive:!1},rel_index:{rules:[65,66,67,68],inclusive:!1},component_ext_queue:{rules:[],inclusive:!1},component_ext_db:{rules:[65,66,67,68],inclusive:!1},component_ext:{rules:[65,66,67,68],inclusive:!1},component_queue:{rules:[65,66,67,68],inclusive:!1},component_db:{rules:[65,66,67,68],inclusive:!1},component:{rules:[65,66,67,68],inclusive:!1},container_boundary:{rules:[65,66,67,68],inclusive:!1},container_ext_queue:{rules:[65,66,67,68],inclusive:!1},container_ext_db:{rules:[65,66,67,68],inclusive:!1},container_ext:{rules:[65,66,67,68],inclusive:!1},container_queue:{rules:[65,66,67,68],inclusive:!1},container_db:{rules:[65,66,67,68],inclusive:!1},container:{rules:[65,66,67,68],inclusive:!1},birel:{rules:[65,66,67,68],inclusive:!1},system_boundary:{rules:[65,66,67,68],inclusive:!1},enterprise_boundary:{rules:[65,66,67,68],inclusive:!1},boundary:{rules:[65,66,67,68],inclusive:!1},system_ext_queue:{rules:[65,66,67,68],inclusive:!1},system_ext_db:{rules:[65,66,67,68],i
nclusive:!1},system_ext:{rules:[65,66,67,68],inclusive:!1},system_queue:{rules:[65,66,67,68],inclusive:!1},system_db:{rules:[65,66,67,68],inclusive:!1},system:{rules:[65,66,67,68],inclusive:!1},person_ext:{rules:[65,66,67,68],inclusive:!1},person:{rules:[65,66,67,68],inclusive:!1},INITIAL:{rules:[0,1,2,3,4,5,6,8,10,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,81,82,83,84,85],inclusive:!0}}};return bt}();Xt.lexer=Ee;function Wt(){this.yy={}}return Wt.prototype=Xt,Xt.Parser=Wt,new Wt}();Yt.parser=Yt;const Be=Yt;let U=[],_t=[""],P="global",j="",V=[{alias:"global",label:{text:"global"},type:{text:"global"},tags:null,link:null,parentBoundary:""}],St=[],te="",ee=!1,It=4,jt=2;var de;const Ye=function(){return de},Ie=function(e){de=ue(e,Dt())},je=function(e,t,a,o,l,i,s,r,n){if(e==null||t===void 0||t===null||a===void 0||a===null||o===void 0||o===null)return;let h={};const f=St.find(d=>d.from===t&&d.to===a);if(f?h=f:St.push(h),h.type=e,h.from=t,h.to=a,h.label={text:o},l==null)h.techn={text:""};else if(typeof l=="object"){let[d,p]=Object.entries(l)[0];h[d]={text:p}}else h.techn={text:l};if(i==null)h.descr={text:""};else if(typeof i=="object"){let[d,p]=Object.entries(i)[0];h[d]={text:p}}else h.descr={text:i};if(typeof s=="object"){let[d,p]=Object.entries(s)[0];h[d]=p}else h.sprite=s;if(typeof r=="object"){let[d,p]=Object.entries(r)[0];h[d]=p}else h.tags=r;if(typeof n=="object"){let[d,p]=Object.entries(n)[0];h[d]=p}else h.link=n;h.wrap=xt()},Ue=function(e,t,a,o,l,i,s){if(t===null||a===null)return;let r={};const n=U.find(h=>h.alias===t);if(n&&t===n.alias?r=n:(r.alias=t,U.push(r)),a==null?r.label={text:""}:r.label={text:a},o==null)r.descr={text:""};else if(typeof o=="object"){let[h,f]=Object.entries(o)[0];r[h]={text:f}}else r.descr={text:o};if(typeof l=="object"){let[h,f]=Object.entries(l)[0];r[h]=f}else r.sprite=l;if(typeof 
i=="object"){let[h,f]=Object.entries(i)[0];r[h]=f}else r.tags=i;if(typeof s=="object"){let[h,f]=Object.entries(s)[0];r[h]=f}else r.link=s;r.typeC4Shape={text:e},r.parentBoundary=P,r.wrap=xt()},Fe=function(e,t,a,o,l,i,s,r){if(t===null||a===null)return;let n={};const h=U.find(f=>f.alias===t);if(h&&t===h.alias?n=h:(n.alias=t,U.push(n)),a==null?n.label={text:""}:n.label={text:a},o==null)n.techn={text:""};else if(typeof o=="object"){let[f,d]=Object.entries(o)[0];n[f]={text:d}}else n.techn={text:o};if(l==null)n.descr={text:""};else if(typeof l=="object"){let[f,d]=Object.entries(l)[0];n[f]={text:d}}else n.descr={text:l};if(typeof i=="object"){let[f,d]=Object.entries(i)[0];n[f]=d}else n.sprite=i;if(typeof s=="object"){let[f,d]=Object.entries(s)[0];n[f]=d}else n.tags=s;if(typeof r=="object"){let[f,d]=Object.entries(r)[0];n[f]=d}else n.link=r;n.wrap=xt(),n.typeC4Shape={text:e},n.parentBoundary=P},Ve=function(e,t,a,o,l,i,s,r){if(t===null||a===null)return;let n={};const h=U.find(f=>f.alias===t);if(h&&t===h.alias?n=h:(n.alias=t,U.push(n)),a==null?n.label={text:""}:n.label={text:a},o==null)n.techn={text:""};else if(typeof o=="object"){let[f,d]=Object.entries(o)[0];n[f]={text:d}}else n.techn={text:o};if(l==null)n.descr={text:""};else if(typeof l=="object"){let[f,d]=Object.entries(l)[0];n[f]={text:d}}else n.descr={text:l};if(typeof i=="object"){let[f,d]=Object.entries(i)[0];n[f]=d}else n.sprite=i;if(typeof s=="object"){let[f,d]=Object.entries(s)[0];n[f]=d}else n.tags=s;if(typeof r=="object"){let[f,d]=Object.entries(r)[0];n[f]=d}else n.link=r;n.wrap=xt(),n.typeC4Shape={text:e},n.parentBoundary=P},ze=function(e,t,a,o,l){if(e===null||t===null)return;let i={};const s=V.find(r=>r.alias===e);if(s&&e===s.alias?i=s:(i.alias=e,V.push(i)),t==null?i.label={text:""}:i.label={text:t},a==null)i.type={text:"system"};else if(typeof a=="object"){let[r,n]=Object.entries(a)[0];i[r]={text:n}}else i.type={text:a};if(typeof o=="object"){let[r,n]=Object.entries(o)[0];i[r]=n}else i.tags=o;if(typeof 
l=="object"){let[r,n]=Object.entries(l)[0];i[r]=n}else i.link=l;i.parentBoundary=P,i.wrap=xt(),j=P,P=e,_t.push(j)},Xe=function(e,t,a,o,l){if(e===null||t===null)return;let i={};const s=V.find(r=>r.alias===e);if(s&&e===s.alias?i=s:(i.alias=e,V.push(i)),t==null?i.label={text:""}:i.label={text:t},a==null)i.type={text:"container"};else if(typeof a=="object"){let[r,n]=Object.entries(a)[0];i[r]={text:n}}else i.type={text:a};if(typeof o=="object"){let[r,n]=Object.entries(o)[0];i[r]=n}else i.tags=o;if(typeof l=="object"){let[r,n]=Object.entries(l)[0];i[r]=n}else i.link=l;i.parentBoundary=P,i.wrap=xt(),j=P,P=e,_t.push(j)},We=function(e,t,a,o,l,i,s,r){if(t===null||a===null)return;let n={};const h=V.find(f=>f.alias===t);if(h&&t===h.alias?n=h:(n.alias=t,V.push(n)),a==null?n.label={text:""}:n.label={text:a},o==null)n.type={text:"node"};else if(typeof o=="object"){let[f,d]=Object.entries(o)[0];n[f]={text:d}}else n.type={text:o};if(l==null)n.descr={text:""};else if(typeof l=="object"){let[f,d]=Object.entries(l)[0];n[f]={text:d}}else n.descr={text:l};if(typeof s=="object"){let[f,d]=Object.entries(s)[0];n[f]=d}else n.tags=s;if(typeof r=="object"){let[f,d]=Object.entries(r)[0];n[f]=d}else n.link=r;n.nodeType=e,n.parentBoundary=P,n.wrap=xt(),j=P,P=t,_t.push(j)},Qe=function(){P=j,_t.pop(),j=_t.pop(),_t.push(j)},He=function(e,t,a,o,l,i,s,r,n,h,f){let d=U.find(p=>p.alias===t);if(!(d===void 0&&(d=V.find(p=>p.alias===t),d===void 0))){if(a!=null)if(typeof a=="object"){let[p,E]=Object.entries(a)[0];d[p]=E}else d.bgColor=a;if(o!=null)if(typeof o=="object"){let[p,E]=Object.entries(o)[0];d[p]=E}else d.fontColor=o;if(l!=null)if(typeof l=="object"){let[p,E]=Object.entries(l)[0];d[p]=E}else d.borderColor=l;if(i!=null)if(typeof i=="object"){let[p,E]=Object.entries(i)[0];d[p]=E}else d.shadowing=i;if(s!=null)if(typeof s=="object"){let[p,E]=Object.entries(s)[0];d[p]=E}else d.shape=s;if(r!=null)if(typeof r=="object"){let[p,E]=Object.entries(r)[0];d[p]=E}else d.sprite=r;if(n!=null)if(typeof 
n=="object"){let[p,E]=Object.entries(n)[0];d[p]=E}else d.techn=n;if(h!=null)if(typeof h=="object"){let[p,E]=Object.entries(h)[0];d[p]=E}else d.legendText=h;if(f!=null)if(typeof f=="object"){let[p,E]=Object.entries(f)[0];d[p]=E}else d.legendSprite=f}},qe=function(e,t,a,o,l,i,s){const r=St.find(n=>n.from===t&&n.to===a);if(r!==void 0){if(o!=null)if(typeof o=="object"){let[n,h]=Object.entries(o)[0];r[n]=h}else r.textColor=o;if(l!=null)if(typeof l=="object"){let[n,h]=Object.entries(l)[0];r[n]=h}else r.lineColor=l;if(i!=null)if(typeof i=="object"){let[n,h]=Object.entries(i)[0];r[n]=parseInt(h)}else r.offsetX=parseInt(i);if(s!=null)if(typeof s=="object"){let[n,h]=Object.entries(s)[0];r[n]=parseInt(h)}else r.offsetY=parseInt(s)}},Ge=function(e,t,a){let o=It,l=jt;if(typeof t=="object"){const i=Object.values(t)[0];o=parseInt(i)}else o=parseInt(t);if(typeof a=="object"){const i=Object.values(a)[0];l=parseInt(i)}else l=parseInt(a);o>=1&&(It=o),l>=1&&(jt=l)},Ke=function(){return It},Je=function(){return jt},Ze=function(){return P},$e=function(){return j},fe=function(e){return e==null?U:U.filter(t=>t.parentBoundary===e)},t0=function(e){return U.find(t=>t.alias===e)},e0=function(e){return Object.keys(fe(e))},pe=function(e){return e==null?V:V.filter(t=>t.parentBoundary===e)},i0=pe,n0=function(){return St},s0=function(){return te},a0=function(e){ee=e},xt=function(){return 
ee},r0=function(){U=[],V=[{alias:"global",label:{text:"global"},type:{text:"global"},tags:null,link:null,parentBoundary:""}],j="",P="global",_t=[""],St=[],_t=[""],te="",ee=!1,It=4,jt=2},l0={SOLID:0,DOTTED:1,NOTE:2,SOLID_CROSS:3,DOTTED_CROSS:4,SOLID_OPEN:5,DOTTED_OPEN:6,LOOP_START:10,LOOP_END:11,ALT_START:12,ALT_ELSE:13,ALT_END:14,OPT_START:15,OPT_END:16,ACTIVE_START:17,ACTIVE_END:18,PAR_START:19,PAR_AND:20,PAR_END:21,RECT_START:22,RECT_END:23,SOLID_POINT:24,DOTTED_POINT:25},o0={FILLED:0,OPEN:1},c0={LEFTOF:0,RIGHTOF:1,OVER:2},h0=function(e){te=ue(e,Dt())},Jt={addPersonOrSystem:Ue,addPersonOrSystemBoundary:ze,addContainer:Fe,addContainerBoundary:Xe,addComponent:Ve,addDeploymentNode:We,popBoundaryParseStack:Qe,addRel:je,updateElStyle:He,updateRelStyle:qe,updateLayoutConfig:Ge,autoWrap:xt,setWrap:a0,getC4ShapeArray:fe,getC4Shape:t0,getC4ShapeKeys:e0,getBoundaries:pe,getBoundarys:i0,getCurrentBoundaryParse:Ze,getParentBoundaryParse:$e,getRels:n0,getTitle:s0,getC4Type:Ye,getC4ShapeInRow:Ke,getC4BoundaryInRow:Je,setAccTitle:Re,getAccTitle:Te,getAccDescription:Oe,setAccDescription:we,getConfig:()=>Dt().c4,clear:r0,LINETYPE:l0,ARROWTYPE:o0,PLACEMENT:c0,setTitle:h0,setC4Type:Ie},ie=function(e,t){return Le(e,t)},ye=function(e,t,a,o,l,i){const s=e.append("image");s.attr("width",t),s.attr("height",a),s.attr("x",o),s.attr("y",l);let r=i.startsWith("data:image/png;base64")?i:Pe.sanitizeUrl(i);s.attr("xlink:href",r)},u0=(e,t,a)=>{const o=e.append("g");let l=0;for(let i of t){let s=i.textColor?i.textColor:"#444444",r=i.lineColor?i.lineColor:"#444444",n=i.offsetX?parseInt(i.offsetX):0,h=i.offsetY?parseInt(i.offsetY):0,f="";if(l===0){let 
p=o.append("line");p.attr("x1",i.startPoint.x),p.attr("y1",i.startPoint.y),p.attr("x2",i.endPoint.x),p.attr("y2",i.endPoint.y),p.attr("stroke-width","1"),p.attr("stroke",r),p.style("fill","none"),i.type!=="rel_b"&&p.attr("marker-end","url("+f+"#arrowhead)"),(i.type==="birel"||i.type==="rel_b")&&p.attr("marker-start","url("+f+"#arrowend)"),l=-1}else{let p=o.append("path");p.attr("fill","none").attr("stroke-width","1").attr("stroke",r).attr("d","Mstartx,starty Qcontrolx,controly stopx,stopy ".replaceAll("startx",i.startPoint.x).replaceAll("starty",i.startPoint.y).replaceAll("controlx",i.startPoint.x+(i.endPoint.x-i.startPoint.x)/2-(i.endPoint.x-i.startPoint.x)/4).replaceAll("controly",i.startPoint.y+(i.endPoint.y-i.startPoint.y)/2).replaceAll("stopx",i.endPoint.x).replaceAll("stopy",i.endPoint.y)),i.type!=="rel_b"&&p.attr("marker-end","url("+f+"#arrowhead)"),(i.type==="birel"||i.type==="rel_b")&&p.attr("marker-start","url("+f+"#arrowend)")}let d=a.messageFont();W(a)(i.label.text,o,Math.min(i.startPoint.x,i.endPoint.x)+Math.abs(i.endPoint.x-i.startPoint.x)/2+n,Math.min(i.startPoint.y,i.endPoint.y)+Math.abs(i.endPoint.y-i.startPoint.y)/2+h,i.label.width,i.label.height,{fill:s},d),i.techn&&i.techn.text!==""&&(d=a.messageFont(),W(a)("["+i.techn.text+"]",o,Math.min(i.startPoint.x,i.endPoint.x)+Math.abs(i.endPoint.x-i.startPoint.x)/2+n,Math.min(i.startPoint.y,i.endPoint.y)+Math.abs(i.endPoint.y-i.startPoint.y)/2+a.messageFontSize+5+h,Math.max(i.label.width,i.techn.width),i.techn.height,{fill:s,"font-style":"italic"},d))}},d0=function(e,t,a){const o=e.append("g");let l=t.bgColor?t.bgColor:"none",i=t.borderColor?t.borderColor:"#444444",s=t.fontColor?t.fontColor:"black",r={"stroke-width":1,"stroke-dasharray":"7.0,7.0"};t.nodeType&&(r={"stroke-width":1});let n={x:t.x,y:t.y,fill:l,stroke:i,width:t.width,height:t.height,rx:2.5,ry:2.5,attrs:r};ie(o,n);let 
h=a.boundaryFont();h.fontWeight="bold",h.fontSize=h.fontSize+2,h.fontColor=s,W(a)(t.label.text,o,t.x,t.y+t.label.Y,t.width,t.height,{fill:"#444444"},h),t.type&&t.type.text!==""&&(h=a.boundaryFont(),h.fontColor=s,W(a)(t.type.text,o,t.x,t.y+t.type.Y,t.width,t.height,{fill:"#444444"},h)),t.descr&&t.descr.text!==""&&(h=a.boundaryFont(),h.fontSize=h.fontSize-2,h.fontColor=s,W(a)(t.descr.text,o,t.x,t.y+t.descr.Y,t.width,t.height,{fill:"#444444"},h))},f0=function(e,t,a){var o;let l=t.bgColor?t.bgColor:a[t.typeC4Shape.text+"_bg_color"],i=t.borderColor?t.borderColor:a[t.typeC4Shape.text+"_border_color"],s=t.fontColor?t.fontColor:"#FFFFFF",r="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAADAAAAAwCAIAAADYYG7QAAACD0lEQVR4Xu2YoU4EMRCGT+4j8Ai8AhaH4QHgAUjQuFMECUgMIUgwJAgMhgQsAYUiJCiQIBBY+EITsjfTdme6V24v4c8vyGbb+ZjOtN0bNcvjQXmkH83WvYBWto6PLm6v7p7uH1/w2fXD+PBycX1Pv2l3IdDm/vn7x+dXQiAubRzoURa7gRZWd0iGRIiJbOnhnfYBQZNJjNbuyY2eJG8fkDE3bbG4ep6MHUAsgYxmE3nVs6VsBWJSGccsOlFPmLIViMzLOB7pCVO2AtHJMohH7Fh6zqitQK7m0rJvAVYgGcEpe//PLdDz65sM4pF9N7ICcXDKIB5Nv6j7tD0NoSdM2QrU9Gg0ewE1LqBhHR3BBdvj2vapnidjHxD/q6vd7Pvhr31AwcY8eXMTXAKECZZJFXuEq27aLgQK5uLMohCenGGuGewOxSjBvYBqeG6B+Nqiblggdjnc+ZXDy+FNFpFzw76O3UBAROuXh6FoiAcf5g9eTvUgzy0nWg6I8cXHRUpg5bOVBCo+KDpFajOf23GgPme7RSQ+lacIENUgJ6gg1k6HjgOlqnLqip4tEuhv0hNEMXUD0clyXE3p6pZA0S2nnvTlXwLJEZWlb7cTQH1+USgTN4VhAenm/wea1OCAOmqo6fE1WCb9WSKBah+rbUWPWAmE2Rvk0ApiB45eOyNAzU8xcTvj8KvkKEoOaIYeHNA3ZuygAvFMUO0AAAAASUVORK5CYII=";switch(t.typeC4Shape.text){case"person":r="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAADAAAAAwCAIAAADYYG7QAAACD0lEQVR4Xu2YoU4EMRCGT+4j8Ai8AhaH4QHgAUjQuFMECUgMIUgwJAgMhgQsAYUiJCiQIBBY+EITsjfTdme6V24v4c8vyGbb+ZjOtN0bNcvjQXmkH83WvYBWto6PLm6v7p7uH1/w2fXD+PBycX1Pv2l3IdDm/vn7x+dXQiAubRzoURa7gRZWd0iGRIiJbOnhnfYBQZNJjNbuyY2eJG8fkDE3bbG4ep6MHUAsgYxmE3nVs6VsBWJSGccsOlFPmLIViMzLOB7pCVO2AtHJMohH7Fh6zqitQK7m0rJvAVYgGcEpe//PLdDz65sM4pF9N7ICcXDKIB5Nv6j7tD0NoSdM2QrU9Gg0ewE1LqBhHR3BBdvj2vapnidjHxD/q6vd7Pvhr31AwcY8eXMTXAKECZZJFXuEq27aLgQK5uLMohCenGGuGewOxSjBvYBqeG
6B+Nqiblggdjnc+ZXDy+FNFpFzw76O3UBAROuXh6FoiAcf5g9eTvUgzy0nWg6I8cXHRUpg5bOVBCo+KDpFajOf23GgPme7RSQ+lacIENUgJ6gg1k6HjgOlqnLqip4tEuhv0hNEMXUD0clyXE3p6pZA0S2nnvTlXwLJEZWlb7cTQH1+USgTN4VhAenm/wea1OCAOmqo6fE1WCb9WSKBah+rbUWPWAmE2Rvk0ApiB45eOyNAzU8xcTvj8KvkKEoOaIYeHNA3ZuygAvFMUO0AAAAASUVORK5CYII=";break;case"external_person":r="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAADAAAAAwCAIAAADYYG7QAAAB6ElEQVR4Xu2YLY+EMBCG9+dWr0aj0Wg0Go1Go0+j8Xdv2uTCvv1gpt0ebHKPuhDaeW4605Z9mJvx4AdXUyTUdd08z+u6flmWZRnHsWkafk9DptAwDPu+f0eAYtu2PEaGWuj5fCIZrBAC2eLBAnRCsEkkxmeaJp7iDJ2QMDdHsLg8SxKFEJaAo8lAXnmuOFIhTMpxxKATebo4UiFknuNo4OniSIXQyRxEA3YsnjGCVEjVXD7yLUAqxBGUyPv/Y4W2beMgGuS7kVQIBycH0fD+oi5pezQETxdHKmQKGk1eQEYldK+jw5GxPfZ9z7Mk0Qnhf1W1m3w//EUn5BDmSZsbR44QQLBEqrBHqOrmSKaQAxdnLArCrxZcM7A7ZKs4ioRq8LFC+NpC3WCBJsvpVw5edm9iEXFuyNfxXAgSwfrFQ1c0iNda8AdejvUgnktOtJQQxmcfFzGglc5WVCj7oDgFqU18boeFSs52CUh8LE8BIVQDT1ABrB0HtgSEYlX5doJnCwv9TXocKCaKbnwhdDKPq4lf3SwU3HLq4V/+WYhHVMa/3b4IlfyikAduCkcBc7mQ3/z/Qq/cTuikhkzB12Ae/mcJC9U+Vo8Ej1gWAtgbeGgFsAMHr50BIWOLCbezvhpBFUdY6EJuJ/QDW0XoMX60zZ0AAAAASUVORK5CYII=";break}const n=e.append("g");n.attr("class","person-man");const h=Ne();switch(t.typeC4Shape.text){case"person":case"external_person":case"system":case"external_system":case"container":case"external_container":case"component":case"external_component":h.x=t.x,h.y=t.y,h.fill=l,h.width=t.width,h.height=t.height,h.stroke=i,h.rx=2.5,h.ry=2.5,h.attrs={"stroke-width":.5},ie(n,h);break;case"system_db":case"external_system_db":case"container_db":case"external_container_db":case"component_db":case"external_component_db":n.append("path").attr("fill",l).attr("stroke-width","0.5").attr("stroke",i).attr("d","Mstartx,startyc0,-10 half,-10 half,-10c0,0 half,0 half,10l0,heightc0,10 -half,10 -half,10c0,0 -half,0 
-half,-10l0,-height".replaceAll("startx",t.x).replaceAll("starty",t.y).replaceAll("half",t.width/2).replaceAll("height",t.height)),n.append("path").attr("fill","none").attr("stroke-width","0.5").attr("stroke",i).attr("d","Mstartx,startyc0,10 half,10 half,10c0,0 half,0 half,-10".replaceAll("startx",t.x).replaceAll("starty",t.y).replaceAll("half",t.width/2));break;case"system_queue":case"external_system_queue":case"container_queue":case"external_container_queue":case"component_queue":case"external_component_queue":n.append("path").attr("fill",l).attr("stroke-width","0.5").attr("stroke",i).attr("d","Mstartx,startylwidth,0c5,0 5,half 5,halfc0,0 0,half -5,halfl-width,0c-5,0 -5,-half -5,-halfc0,0 0,-half 5,-half".replaceAll("startx",t.x).replaceAll("starty",t.y).replaceAll("width",t.width).replaceAll("half",t.height/2)),n.append("path").attr("fill","none").attr("stroke-width","0.5").attr("stroke",i).attr("d","Mstartx,startyc-5,0 -5,half -5,halfc0,half 5,half 5,half".replaceAll("startx",t.x+t.width).replaceAll("starty",t.y).replaceAll("half",t.height/2));break}let f=v0(a,t.typeC4Shape.text);switch(n.append("text").attr("fill",s).attr("font-family",f.fontFamily).attr("font-size",f.fontSize-2).attr("font-style","italic").attr("lengthAdjust","spacing").attr("textLength",t.typeC4Shape.width).attr("x",t.x+t.width/2-t.typeC4Shape.width/2).attr("y",t.y+t.typeC4Shape.Y).text("<<"+t.typeC4Shape.text+">>"),t.typeC4Shape.text){case"person":case"external_person":ye(n,48,48,t.x+t.width/2-24,t.y+t.image.Y,r);break}let d=a[t.typeC4Shape.text+"Font"]();return d.fontWeight="bold",d.fontSize=d.fontSize+2,d.fontColor=s,W(a)(t.label.text,n,t.x,t.y+t.label.Y,t.width,t.height,{fill:s},d),d=a[t.typeC4Shape.text+"Font"](),d.fontColor=s,t.techn&&((o=t.techn)==null?void 
0:o.text)!==""?W(a)(t.techn.text,n,t.x,t.y+t.techn.Y,t.width,t.height,{fill:s,"font-style":"italic"},d):t.type&&t.type.text!==""&&W(a)(t.type.text,n,t.x,t.y+t.type.Y,t.width,t.height,{fill:s,"font-style":"italic"},d),t.descr&&t.descr.text!==""&&(d=a.personFont(),d.fontColor=s,W(a)(t.descr.text,n,t.x,t.y+t.descr.Y,t.width,t.height,{fill:s},d)),t.height},p0=function(e){e.append("defs").append("symbol").attr("id","database").attr("fill-rule","evenodd").attr("clip-rule","evenodd").append("path").attr("transform","scale(.5)").attr("d","M12.258.001l.256.004.255.005.253.008.251.01.249.012.247.015.246.016.242.019.241.02.239.023.236.024.233.027.231.028.229.031.225.032.223.034.22.036.217.038.214.04.211.041.208.043.205.045.201.046.198.048.194.05.191.051.187.053.183.054.18.056.175.057.172.059.168.06.163.061.16.063.155.064.15.066.074.033.073.033.071.034.07.034.069.035.068.035.067.035.066.035.064.036.064.036.062.036.06.036.06.037.058.037.058.037.055.038.055.038.053.038.052.038.051.039.05.039.048.039.047.039.045.04.044.04.043.04.041.04.04.041.039.041.037.041.036.041.034.041.033.042.032.042.03.042.029.042.027.042.026.043.024.043.023.043.021.043.02.043.018.044.017.043.015.044.013.044.012.044.011.045.009.044.007.045.006.045.004.045.002.045.001.045v17l-.001.045-.002.045-.004.045-.006.045-.007.045-.009.044-.011.045-.012.044-.013.044-.015.044-.017.043-.018.044-.02.043-.021.043-.023.043-.024.043-.026.043-.027.042-.029.042-.03.042-.032.042-.033.042-.034.041-.036.041-.037.041-.039.041-.04.041-.041.04-.043.04-.044.04-.045.04-.047.039-.048.039-.05.039-.051.039-.052.038-.053.038-.055.038-.055.038-.058.037-.058.037-.06.037-.06.036-.062.036-.064.036-.064.036-.066.035-.067.035-.068.035-.069.035-.07.034-.071.034-.073.033-.074.033-.15.066-.155.064-.16.063-.163.061-.168.06-.172.059-.175.057-.18.056-.183.054-.187.053-.191.051-.194.05-.198.048-.201.046-.205.045-.208.043-.211.041-.214.04-.217.038-.22.036-.223.034-.225.032-.229.031-.231.028-.233.027-.236.024-.239.023-.241.02-.242.019-.246.016-.247.015-
.249.012-.251.01-.253.008-.255.005-.256.004-.258.001-.258-.001-.256-.004-.255-.005-.253-.008-.251-.01-.249-.012-.247-.015-.245-.016-.243-.019-.241-.02-.238-.023-.236-.024-.234-.027-.231-.028-.228-.031-.226-.032-.223-.034-.22-.036-.217-.038-.214-.04-.211-.041-.208-.043-.204-.045-.201-.046-.198-.048-.195-.05-.19-.051-.187-.053-.184-.054-.179-.056-.176-.057-.172-.059-.167-.06-.164-.061-.159-.063-.155-.064-.151-.066-.074-.033-.072-.033-.072-.034-.07-.034-.069-.035-.068-.035-.067-.035-.066-.035-.064-.036-.063-.036-.062-.036-.061-.036-.06-.037-.058-.037-.057-.037-.056-.038-.055-.038-.053-.038-.052-.038-.051-.039-.049-.039-.049-.039-.046-.039-.046-.04-.044-.04-.043-.04-.041-.04-.04-.041-.039-.041-.037-.041-.036-.041-.034-.041-.033-.042-.032-.042-.03-.042-.029-.042-.027-.042-.026-.043-.024-.043-.023-.043-.021-.043-.02-.043-.018-.044-.017-.043-.015-.044-.013-.044-.012-.044-.011-.045-.009-.044-.007-.045-.006-.045-.004-.045-.002-.045-.001-.045v-17l.001-.045.002-.045.004-.045.006-.045.007-.045.009-.044.011-.045.012-.044.013-.044.015-.044.017-.043.018-.044.02-.043.021-.043.023-.043.024-.043.026-.043.027-.042.029-.042.03-.042.032-.042.033-.042.034-.041.036-.041.037-.041.039-.041.04-.041.041-.04.043-.04.044-.04.046-.04.046-.039.049-.039.049-.039.051-.039.052-.038.053-.038.055-.038.056-.038.057-.037.058-.037.06-.037.061-.036.062-.036.063-.036.064-.036.066-.035.067-.035.068-.035.069-.035.07-.034.072-.034.072-.033.074-.033.151-.066.155-.064.159-.063.164-.061.167-.06.172-.059.176-.057.179-.056.184-.054.187-.053.19-.051.195-.05.198-.048.201-.046.204-.045.208-.043.211-.041.214-.04.217-.038.22-.036.223-.034.226-.032.228-.031.231-.028.234-.027.236-.024.238-.023.241-.02.243-.019.245-.016.247-.015.249-.012.251-.01.253-.008.255-.005.256-.004.258-.001.258.001zm-9.258 
20.499v.01l.001.021.003.021.004.022.005.021.006.022.007.022.009.023.01.022.011.023.012.023.013.023.015.023.016.024.017.023.018.024.019.024.021.024.022.025.023.024.024.025.052.049.056.05.061.051.066.051.07.051.075.051.079.052.084.052.088.052.092.052.097.052.102.051.105.052.11.052.114.051.119.051.123.051.127.05.131.05.135.05.139.048.144.049.147.047.152.047.155.047.16.045.163.045.167.043.171.043.176.041.178.041.183.039.187.039.19.037.194.035.197.035.202.033.204.031.209.03.212.029.216.027.219.025.222.024.226.021.23.02.233.018.236.016.24.015.243.012.246.01.249.008.253.005.256.004.259.001.26-.001.257-.004.254-.005.25-.008.247-.011.244-.012.241-.014.237-.016.233-.018.231-.021.226-.021.224-.024.22-.026.216-.027.212-.028.21-.031.205-.031.202-.034.198-.034.194-.036.191-.037.187-.039.183-.04.179-.04.175-.042.172-.043.168-.044.163-.045.16-.046.155-.046.152-.047.148-.048.143-.049.139-.049.136-.05.131-.05.126-.05.123-.051.118-.052.114-.051.11-.052.106-.052.101-.052.096-.052.092-.052.088-.053.083-.051.079-.052.074-.052.07-.051.065-.051.06-.051.056-.05.051-.05.023-.024.023-.025.021-.024.02-.024.019-.024.018-.024.017-.024.015-.023.014-.024.013-.023.012-.023.01-.023.01-.022.008-.022.006-.022.006-.022.004-.022.004-.021.001-.021.001-.021v-4.127l-.077.055-.08.053-.083.054-.085.053-.087.052-.09.052-.093.051-.095.05-.097.05-.1.049-.102.049-.105.048-.106.047-.109.047-.111.046-.114.045-.115.045-.118.044-.12.043-.122.042-.124.042-.126.041-.128.04-.13.04-.132.038-.134.038-.135.037-.138.037-.139.035-.142.035-.143.034-.144.033-.147.032-.148.031-.15.03-.151.03-.153.029-.154.027-.156.027-.158.026-.159.025-.161.024-.162.023-.163.022-.165.021-.166.02-.167.019-.169.018-.169.017-.171.016-.173.015-.173.014-.175.013-.175.012-.177.011-.178.01-.179.008-.179.008-.181.006-.182.005-.182.004-.184.003-.184.002h-.37l-.184-.002-.184-.003-.182-.004-.182-.005-.181-.006-.179-.008-.179-.008-.178-.01-.176-.011-.176-.012-.175-.013-.173-.014-.172-.015-.171-.016-.17-.017-.169-.018-.167-.019-.166-.02-.165-.021-.163-.022
-.162-.023-.161-.024-.159-.025-.157-.026-.156-.027-.155-.027-.153-.029-.151-.03-.15-.03-.148-.031-.146-.032-.145-.033-.143-.034-.141-.035-.14-.035-.137-.037-.136-.037-.134-.038-.132-.038-.13-.04-.128-.04-.126-.041-.124-.042-.122-.042-.12-.044-.117-.043-.116-.045-.113-.045-.112-.046-.109-.047-.106-.047-.105-.048-.102-.049-.1-.049-.097-.05-.095-.05-.093-.052-.09-.051-.087-.052-.085-.053-.083-.054-.08-.054-.077-.054v4.127zm0-5.654v.011l.001.021.003.021.004.021.005.022.006.022.007.022.009.022.01.022.011.023.012.023.013.023.015.024.016.023.017.024.018.024.019.024.021.024.022.024.023.025.024.024.052.05.056.05.061.05.066.051.07.051.075.052.079.051.084.052.088.052.092.052.097.052.102.052.105.052.11.051.114.051.119.052.123.05.127.051.131.05.135.049.139.049.144.048.147.048.152.047.155.046.16.045.163.045.167.044.171.042.176.042.178.04.183.04.187.038.19.037.194.036.197.034.202.033.204.032.209.03.212.028.216.027.219.025.222.024.226.022.23.02.233.018.236.016.24.014.243.012.246.01.249.008.253.006.256.003.259.001.26-.001.257-.003.254-.006.25-.008.247-.01.244-.012.241-.015.237-.016.233-.018.231-.02.226-.022.224-.024.22-.025.216-.027.212-.029.21-.03.205-.032.202-.033.198-.035.194-.036.191-.037.187-.039.183-.039.179-.041.175-.042.172-.043.168-.044.163-.045.16-.045.155-.047.152-.047.148-.048.143-.048.139-.05.136-.049.131-.05.126-.051.123-.051.118-.051.114-.052.11-.052.106-.052.101-.052.096-.052.092-.052.088-.052.083-.052.079-.052.074-.051.07-.052.065-.051.06-.05.056-.051.051-.049.023-.025.023-.024.021-.025.02-.024.019-.024.018-.024.017-.024.015-.023.014-.023.013-.024.012-.022.01-.023.01-.023.008-.022.006-.022.006-.022.004-.021.004-.022.001-.021.001-.021v-4.139l-.077.054-.08.054-.083.054-.085.052-.087.053-.09.051-.093.051-.095.051-.097.05-.1.049-.102.049-.105.048-.106.047-.109.047-.111.046-.114.045-.115.044-.118.044-.12.044-.122.042-.124.042-.126.041-.128.04-.13.039-.132.039-.134.038-.135.037-.138.036-.139.036-.142.035-.143.033-.144.033-.147.033-.148.031-.15.03-.151.03-.153.028-.154.028
-.156.027-.158.026-.159.025-.161.024-.162.023-.163.022-.165.021-.166.02-.167.019-.169.018-.169.017-.171.016-.173.015-.173.014-.175.013-.175.012-.177.011-.178.009-.179.009-.179.007-.181.007-.182.005-.182.004-.184.003-.184.002h-.37l-.184-.002-.184-.003-.182-.004-.182-.005-.181-.007-.179-.007-.179-.009-.178-.009-.176-.011-.176-.012-.175-.013-.173-.014-.172-.015-.171-.016-.17-.017-.169-.018-.167-.019-.166-.02-.165-.021-.163-.022-.162-.023-.161-.024-.159-.025-.157-.026-.156-.027-.155-.028-.153-.028-.151-.03-.15-.03-.148-.031-.146-.033-.145-.033-.143-.033-.141-.035-.14-.036-.137-.036-.136-.037-.134-.038-.132-.039-.13-.039-.128-.04-.126-.041-.124-.042-.122-.043-.12-.043-.117-.044-.116-.044-.113-.046-.112-.046-.109-.046-.106-.047-.105-.048-.102-.049-.1-.049-.097-.05-.095-.051-.093-.051-.09-.051-.087-.053-.085-.052-.083-.054-.08-.054-.077-.054v4.139zm0-5.666v.011l.001.02.003.022.004.021.005.022.006.021.007.022.009.023.01.022.011.023.012.023.013.023.015.023.016.024.017.024.018.023.019.024.021.025.022.024.023.024.024.025.052.05.056.05.061.05.066.051.07.051.075.052.079.051.084.052.088.052.092.052.097.052.102.052.105.051.11.052.114.051.119.051.123.051.127.05.131.05.135.05.139.049.144.048.147.048.152.047.155.046.16.045.163.045.167.043.171.043.176.042.178.04.183.04.187.038.19.037.194.036.197.034.202.033.204.032.209.03.212.028.216.027.219.025.222.024.226.021.23.02.233.018.236.017.24.014.243.012.246.01.249.008.253.006.256.003.259.001.26-.001.257-.003.254-.006.25-.008.247-.01.244-.013.241-.014.237-.016.233-.018.231-.02.226-.022.224-.024.22-.025.216-.027.212-.029.21-.03.205-.032.202-.033.198-.035.194-.036.191-.037.187-.039.183-.039.179-.041.175-.042.172-.043.168-.044.163-.045.16-.045.155-.047.152-.047.148-.048.143-.049.139-.049.136-.049.131-.051.126-.05.123-.051.118-.052.114-.051.11-.052.106-.052.101-.052.096-.052.092-.052.088-.052.083-.052.079-.052.074-.052.07-.051.065-.051.06-.051.056-.05.051-.049.023-.025.023-.025.021-.024.02-.024.019-.024.018-.024.017-.024.015-.023.014-.024.013-.0
23.012-.023.01-.022.01-.023.008-.022.006-.022.006-.022.004-.022.004-.021.001-.021.001-.021v-4.153l-.077.054-.08.054-.083.053-.085.053-.087.053-.09.051-.093.051-.095.051-.097.05-.1.049-.102.048-.105.048-.106.048-.109.046-.111.046-.114.046-.115.044-.118.044-.12.043-.122.043-.124.042-.126.041-.128.04-.13.039-.132.039-.134.038-.135.037-.138.036-.139.036-.142.034-.143.034-.144.033-.147.032-.148.032-.15.03-.151.03-.153.028-.154.028-.156.027-.158.026-.159.024-.161.024-.162.023-.163.023-.165.021-.166.02-.167.019-.169.018-.169.017-.171.016-.173.015-.173.014-.175.013-.175.012-.177.01-.178.01-.179.009-.179.007-.181.006-.182.006-.182.004-.184.003-.184.001-.185.001-.185-.001-.184-.001-.184-.003-.182-.004-.182-.006-.181-.006-.179-.007-.179-.009-.178-.01-.176-.01-.176-.012-.175-.013-.173-.014-.172-.015-.171-.016-.17-.017-.169-.018-.167-.019-.166-.02-.165-.021-.163-.023-.162-.023-.161-.024-.159-.024-.157-.026-.156-.027-.155-.028-.153-.028-.151-.03-.15-.03-.148-.032-.146-.032-.145-.033-.143-.034-.141-.034-.14-.036-.137-.036-.136-.037-.134-.038-.132-.039-.13-.039-.128-.041-.126-.041-.124-.041-.122-.043-.12-.043-.117-.044-.116-.044-.113-.046-.112-.046-.109-.046-.106-.048-.105-.048-.102-.048-.1-.05-.097-.049-.095-.051-.093-.051-.09-.052-.087-.052-.085-.053-.083-.053-.08-.054-.077-.054v4.153zm8.74-8.179l-.257.004-.254.005-.25.008-.247.011-.244.012-.241.014-.237.016-.233.018-.231.021-.226.022-.224.023-.22.026-.216.027-.212.028-.21.031-.205.032-.202.033-.198.034-.194.036-.191.038-.187.038-.183.04-.179.041-.175.042-.172.043-.168.043-.163.045-.16.046-.155.046-.152.048-.148.048-.143.048-.139.049-.136.05-.131.05-.126.051-.123.051-.118.051-.114.052-.11.052-.106.052-.101.052-.096.052-.092.052-.088.052-.083.052-.079.052-.074.051-.07.052-.065.051-.06.05-.056.05-.051.05-.023.025-.023.024-.021.024-.02.025-.019.024-.018.024-.017.023-.015.024-.014.023-.013.023-.012.023-.01.023-.01.022-.008.022-.006.023-.006.021-.004.022-.004.021-.001.021-.001.021.001.021.001.021.004.021.004.022.006.021.006.023.008.02
2.01.022.01.023.012.023.013.023.014.023.015.024.017.023.018.024.019.024.02.025.021.024.023.024.023.025.051.05.056.05.06.05.065.051.07.052.074.051.079.052.083.052.088.052.092.052.096.052.101.052.106.052.11.052.114.052.118.051.123.051.126.051.131.05.136.05.139.049.143.048.148.048.152.048.155.046.16.046.163.045.168.043.172.043.175.042.179.041.183.04.187.038.191.038.194.036.198.034.202.033.205.032.21.031.212.028.216.027.22.026.224.023.226.022.231.021.233.018.237.016.241.014.244.012.247.011.25.008.254.005.257.004.26.001.26-.001.257-.004.254-.005.25-.008.247-.011.244-.012.241-.014.237-.016.233-.018.231-.021.226-.022.224-.023.22-.026.216-.027.212-.028.21-.031.205-.032.202-.033.198-.034.194-.036.191-.038.187-.038.183-.04.179-.041.175-.042.172-.043.168-.043.163-.045.16-.046.155-.046.152-.048.148-.048.143-.048.139-.049.136-.05.131-.05.126-.051.123-.051.118-.051.114-.052.11-.052.106-.052.101-.052.096-.052.092-.052.088-.052.083-.052.079-.052.074-.051.07-.052.065-.051.06-.05.056-.05.051-.05.023-.025.023-.024.021-.024.02-.025.019-.024.018-.024.017-.023.015-.024.014-.023.013-.023.012-.023.01-.023.01-.022.008-.022.006-.023.006-.021.004-.022.004-.021.001-.021.001-.021-.001-.021-.001-.021-.004-.021-.004-.022-.006-.021-.006-.023-.008-.022-.01-.022-.01-.023-.012-.023-.013-.023-.014-.023-.015-.024-.017-.023-.018-.024-.019-.024-.02-.025-.021-.024-.023-.024-.023-.025-.051-.05-.056-.05-.06-.05-.065-.051-.07-.052-.074-.051-.079-.052-.083-.052-.088-.052-.092-.052-.096-.052-.101-.052-.106-.052-.11-.052-.114-.052-.118-.051-.123-.051-.126-.051-.131-.05-.136-.05-.139-.049-.143-.048-.148-.048-.152-.048-.155-.046-.16-.046-.163-.045-.168-.043-.172-.043-.175-.042-.179-.041-.183-.04-.187-.038-.191-.038-.194-.036-.198-.034-.202-.033-.205-.032-.21-.031-.212-.028-.216-.027-.22-.026-.224-.023-.226-.022-.231-.021-.233-.018-.237-.016-.241-.014-.244-.012-.247-.011-.25-.008-.254-.005-.257-.004-.26-.001-.26.001z")},y0=function(e){e.append("defs").append("symbol").attr("id","computer").attr("width","24").attr(
"height","24").append("path").attr("transform","scale(.5)").attr("d","M2 2v13h20v-13h-20zm18 11h-16v-9h16v9zm-10.228 6l.466-1h3.524l.467 1h-4.457zm14.228 3h-24l2-6h2.104l-1.33 4h18.45l-1.297-4h2.073l2 6zm-5-10h-14v-7h14v7z")},g0=function(e){e.append("defs").append("symbol").attr("id","clock").attr("width","24").attr("height","24").append("path").attr("transform","scale(.5)").attr("d","M12 2c5.514 0 10 4.486 10 10s-4.486 10-10 10-10-4.486-10-10 4.486-10 10-10zm0-2c-6.627 0-12 5.373-12 12s5.373 12 12 12 12-5.373 12-12-5.373-12-12-12zm5.848 12.459c.202.038.202.333.001.372-1.907.361-6.045 1.111-6.547 1.111-.719 0-1.301-.582-1.301-1.301 0-.512.77-5.447 1.125-7.445.034-.192.312-.181.343.014l.985 6.238 5.394 1.011z")},b0=function(e){e.append("defs").append("marker").attr("id","arrowhead").attr("refX",9).attr("refY",5).attr("markerUnits","userSpaceOnUse").attr("markerWidth",12).attr("markerHeight",12).attr("orient","auto").append("path").attr("d","M 0 0 L 10 5 L 0 10 z")},_0=function(e){e.append("defs").append("marker").attr("id","arrowend").attr("refX",1).attr("refY",5).attr("markerUnits","userSpaceOnUse").attr("markerWidth",12).attr("markerHeight",12).attr("orient","auto").append("path").attr("d","M 10 0 L 0 5 L 10 10 z")},x0=function(e){e.append("defs").append("marker").attr("id","filled-head").attr("refX",18).attr("refY",7).attr("markerWidth",20).attr("markerHeight",28).attr("orient","auto").append("path").attr("d","M 18,7 L9,13 L14,7 L9,1 Z")},m0=function(e){e.append("defs").append("marker").attr("id","sequencenumber").attr("refX",15).attr("refY",15).attr("markerWidth",60).attr("markerHeight",40).attr("orient","auto").append("circle").attr("cx",15).attr("cy",15).attr("r",6)},E0=function(e){const a=e.append("defs").append("marker").attr("id","crosshead").attr("markerWidth",15).attr("markerHeight",8).attr("orient","auto").attr("refX",16).attr("refY",4);a.append("path").attr("fill","black").attr("stroke","#000000").style("stroke-dasharray","0, 
0").attr("stroke-width","1px").attr("d","M 9,2 V 6 L16,4 Z"),a.append("path").attr("fill","none").attr("stroke","#000000").style("stroke-dasharray","0, 0").attr("stroke-width","1px").attr("d","M 0,1 L 6,7 M 6,1 L 0,7")},v0=(e,t)=>({fontFamily:e[t+"FontFamily"],fontSize:e[t+"FontSize"],fontWeight:e[t+"FontWeight"]}),W=function(){function e(l,i,s,r,n,h,f){const d=i.append("text").attr("x",s+n/2).attr("y",r+h/2+5).style("text-anchor","middle").text(l);o(d,f)}function t(l,i,s,r,n,h,f,d){const{fontSize:p,fontFamily:E,fontWeight:O}=d,R=l.split(Kt.lineBreakRegex);for(let S=0;S<R.length;S++){const L=S*p-p*(R.length-1)/2,Y=i.append("text").attr("x",s+n/2).attr("y",r).style("text-anchor","middle").attr("dominant-baseline","middle").style("font-size",p).style("font-weight",O).style("font-family",E);Y.append("tspan").attr("dy",L).text(R[S]).attr("alignment-baseline","mathematical"),o(Y,f)}}function a(l,i,s,r,n,h,f,d){const p=i.append("switch"),O=p.append("foreignObject").attr("x",s).attr("y",r).attr("width",n).attr("height",h).append("xhtml:div").style("display","table").style("height","100%").style("width","100%");O.append("div").style("display","table-cell").style("text-align","center").style("vertical-align","middle").text(l),t(l,p,s,r,n,h,f,d),o(O,f)}function o(l,i){for(const s in i)i.hasOwnProperty(s)&&l.attr(s,i[s])}return function(l){return l.textPlacement==="fo"?a:l.textPlacement==="old"?e:t}}(),F={drawRect:ie,drawBoundary:d0,drawC4Shape:f0,drawRels:u0,drawImage:ye,insertArrowHead:b0,insertArrowEnd:_0,insertArrowFilledHead:x0,insertDynamicNumber:m0,insertArrowCrossHead:E0,insertDatabaseIcon:p0,insertComputerIcon:y0,insertClockIcon:g0};let Ut=0,Ft=0,ge=4,Zt=2;Yt.yy=Jt;let b={};class be{constructor(t){this.name="",this.data={},this.data.startx=void 0,this.data.stopx=void 0,this.data.starty=void 0,this.data.stopy=void 0,this.data.widthLimit=void 0,this.nextData={},this.nextData.startx=void 0,this.nextData.stopx=void 0,this.nextData.starty=void 0,this.nextData.stopy=void 
0,this.nextData.cnt=0,$t(t.db.getConfig())}setData(t,a,o,l){this.nextData.startx=this.data.startx=t,this.nextData.stopx=this.data.stopx=a,this.nextData.starty=this.data.starty=o,this.nextData.stopy=this.data.stopy=l}updateVal(t,a,o,l){t[a]===void 0?t[a]=o:t[a]=l(o,t[a])}insert(t){this.nextData.cnt=this.nextData.cnt+1;let a=this.nextData.startx===this.nextData.stopx?this.nextData.stopx+t.margin:this.nextData.stopx+t.margin*2,o=a+t.width,l=this.nextData.starty+t.margin*2,i=l+t.height;(a>=this.data.widthLimit||o>=this.data.widthLimit||this.nextData.cnt>ge)&&(a=this.nextData.startx+t.margin+b.nextLinePaddingX,l=this.nextData.stopy+t.margin*2,this.nextData.stopx=o=a+t.width,this.nextData.starty=this.nextData.stopy,this.nextData.stopy=i=l+t.height,this.nextData.cnt=1),t.x=a,t.y=l,this.updateVal(this.data,"startx",a,Math.min),this.updateVal(this.data,"starty",l,Math.min),this.updateVal(this.data,"stopx",o,Math.max),this.updateVal(this.data,"stopy",i,Math.max),this.updateVal(this.nextData,"startx",a,Math.min),this.updateVal(this.nextData,"starty",l,Math.min),this.updateVal(this.nextData,"stopx",o,Math.max),this.updateVal(this.nextData,"stopy",i,Math.max)}init(t){this.name="",this.data={startx:void 0,stopx:void 0,starty:void 0,stopy:void 0,widthLimit:void 0},this.nextData={startx:void 0,stopx:void 0,starty:void 0,stopy:void 0,cnt:0},$t(t.db.getConfig())}bumpLastMargin(t){this.data.stopx+=t,this.data.stopy+=t}}const 
$t=function(e){Se(b,e),e.fontFamily&&(b.personFontFamily=b.systemFontFamily=b.messageFontFamily=e.fontFamily),e.fontSize&&(b.personFontSize=b.systemFontSize=b.messageFontSize=e.fontSize),e.fontWeight&&(b.personFontWeight=b.systemFontWeight=b.messageFontWeight=e.fontWeight)},Rt=(e,t)=>({fontFamily:e[t+"FontFamily"],fontSize:e[t+"FontSize"],fontWeight:e[t+"FontWeight"]}),Bt=e=>({fontFamily:e.boundaryFontFamily,fontSize:e.boundaryFontSize,fontWeight:e.boundaryFontWeight}),k0=e=>({fontFamily:e.messageFontFamily,fontSize:e.messageFontSize,fontWeight:e.messageFontWeight});function I(e,t,a,o,l){if(!t[e].width)if(a)t[e].text=Me(t[e].text,l,o),t[e].textLines=t[e].text.split(Kt.lineBreakRegex).length,t[e].width=l,t[e].height=oe(t[e].text,o);else{let i=t[e].text.split(Kt.lineBreakRegex);t[e].textLines=i.length;let s=0;t[e].height=0,t[e].width=0;for(const r of i)t[e].width=Math.max(wt(r,o),t[e].width),s=oe(r,o),t[e].height=t[e].height+s}}const _e=function(e,t,a){t.x=a.data.startx,t.y=a.data.starty,t.width=a.data.stopx-a.data.startx,t.height=a.data.stopy-a.data.starty,t.label.y=b.c4ShapeMargin-35;let o=t.wrap&&b.wrap,l=Bt(b);l.fontSize=l.fontSize+2,l.fontWeight="bold";let i=wt(t.label.text,l);I("label",t,o,l,i),F.drawBoundary(e,t,b)},xe=function(e,t,a,o){let l=0;for(const i of o){l=0;const s=a[i];let r=Rt(b,s.typeC4Shape.text);switch(r.fontSize=r.fontSize-2,s.typeC4Shape.width=wt("«"+s.typeC4Shape.text+"»",r),s.typeC4Shape.height=r.fontSize+2,s.typeC4Shape.Y=b.c4ShapePadding,l=s.typeC4Shape.Y+s.typeC4Shape.height-4,s.image={width:0,height:0,Y:0},s.typeC4Shape.text){case"person":case"external_person":s.image.width=48,s.image.height=48,s.image.Y=l,l=s.image.Y+s.image.height;break}s.sprite&&(s.image.width=48,s.image.height=48,s.image.Y=l,l=s.image.Y+s.image.height);let 
n=s.wrap&&b.wrap,h=b.width-b.c4ShapePadding*2,f=Rt(b,s.typeC4Shape.text);if(f.fontSize=f.fontSize+2,f.fontWeight="bold",I("label",s,n,f,h),s.label.Y=l+8,l=s.label.Y+s.label.height,s.type&&s.type.text!==""){s.type.text="["+s.type.text+"]";let E=Rt(b,s.typeC4Shape.text);I("type",s,n,E,h),s.type.Y=l+5,l=s.type.Y+s.type.height}else if(s.techn&&s.techn.text!==""){s.techn.text="["+s.techn.text+"]";let E=Rt(b,s.techn.text);I("techn",s,n,E,h),s.techn.Y=l+5,l=s.techn.Y+s.techn.height}let d=l,p=s.label.width;if(s.descr&&s.descr.text!==""){let E=Rt(b,s.typeC4Shape.text);I("descr",s,n,E,h),s.descr.Y=l+20,l=s.descr.Y+s.descr.height,p=Math.max(s.label.width,s.descr.width),d=l-s.descr.textLines*5}p=p+b.c4ShapePadding,s.width=Math.max(s.width||b.width,p,b.width),s.height=Math.max(s.height||b.height,d,b.height),s.margin=s.margin||b.c4ShapeMargin,e.insert(s),F.drawC4Shape(t,s,b)}e.bumpLastMargin(b.c4ShapeMargin)};class B{constructor(t,a){this.x=t,this.y=a}}let ce=function(e,t){let a=e.x,o=e.y,l=t.x,i=t.y,s=a+e.width/2,r=o+e.height/2,n=Math.abs(a-l),h=Math.abs(o-i),f=h/n,d=e.height/e.width,p=null;return o==i&&a<l?p=new B(a+e.width,r):o==i&&a>l?p=new B(a,r):a==l&&o<i?p=new B(s,o+e.height):a==l&&o>i&&(p=new B(s,o)),a>l&&o<i?d>=f?p=new B(a,r+f*e.width/2):p=new B(s-n/h*e.height/2,o+e.height):a<l&&o<i?d>=f?p=new B(a+e.width,r+f*e.width/2):p=new B(s+n/h*e.height/2,o+e.height):a<l&&o>i?d>=f?p=new B(a+e.width,r-f*e.width/2):p=new B(s+e.height/2*n/h,o):a>l&&o>i&&(d>=f?p=new B(a,r-e.width/2*f):p=new B(s-e.height/2*n/h,o)),p},A0=function(e,t){let a={x:0,y:0};a.x=t.x+t.width/2,a.y=t.y+t.height/2;let o=ce(e,a);a.x=e.x+e.width/2,a.y=e.y+e.height/2;let l=ce(t,a);return{startPoint:o,endPoint:l}};const C0=function(e,t,a,o){let l=0;for(let i of t){l=l+1;let s=i.wrap&&b.wrap,r=k0(b);o.db.getC4Type()==="C4Dynamic"&&(i.label.text=l+": "+i.label.text);let 
h=wt(i.label.text,r);I("label",i,s,r,h),i.techn&&i.techn.text!==""&&(h=wt(i.techn.text,r),I("techn",i,s,r,h)),i.descr&&i.descr.text!==""&&(h=wt(i.descr.text,r),I("descr",i,s,r,h));let f=a(i.from),d=a(i.to),p=A0(f,d);i.startPoint=p.startPoint,i.endPoint=p.endPoint}F.drawRels(e,t,b)};function me(e,t,a,o,l){let i=new be(l);i.data.widthLimit=a.data.widthLimit/Math.min(Zt,o.length);for(let[s,r]of o.entries()){let n=0;r.image={width:0,height:0,Y:0},r.sprite&&(r.image.width=48,r.image.height=48,r.image.Y=n,n=r.image.Y+r.image.height);let h=r.wrap&&b.wrap,f=Bt(b);if(f.fontSize=f.fontSize+2,f.fontWeight="bold",I("label",r,h,f,i.data.widthLimit),r.label.Y=n+8,n=r.label.Y+r.label.height,r.type&&r.type.text!==""){r.type.text="["+r.type.text+"]";let O=Bt(b);I("type",r,h,O,i.data.widthLimit),r.type.Y=n+5,n=r.type.Y+r.type.height}if(r.descr&&r.descr.text!==""){let O=Bt(b);O.fontSize=O.fontSize-2,I("descr",r,h,O,i.data.widthLimit),r.descr.Y=n+20,n=r.descr.Y+r.descr.height}if(s==0||s%Zt===0){let O=a.data.startx+b.diagramMarginX,R=a.data.stopy+b.diagramMarginY+n;i.setData(O,O,R,R)}else{let O=i.data.stopx!==i.data.startx?i.data.stopx+b.diagramMarginX:i.data.startx,R=i.data.starty;i.setData(O,O,R,R)}i.name=r.alias;let d=l.db.getC4ShapeArray(r.alias),p=l.db.getC4ShapeKeys(r.alias);p.length>0&&xe(i,e,d,p),t=r.alias;let E=l.db.getBoundarys(t);E.length>0&&me(e,t,i,E,l),r.alias!=="global"&&_e(e,r,i),a.data.stopy=Math.max(i.data.stopy+b.c4ShapeMargin,a.data.stopy),a.data.stopx=Math.max(i.data.stopx+b.c4ShapeMargin,a.data.stopx),Ut=Math.max(Ut,a.data.stopx),Ft=Math.max(Ft,a.data.stopy)}}const w0=function(e,t,a,o){b=Dt().c4;const l=Dt().securityLevel;let i;l==="sandbox"&&(i=Nt("#i"+t));const s=l==="sandbox"?Nt(i.nodes()[0].contentDocument.body):Nt("body");let r=o.db;o.db.setWrap(b.wrap),ge=r.getC4ShapeInRow(),Zt=r.getC4BoundaryInRow(),le.debug(`C:${JSON.stringify(b,null,2)}`);const 
n=l==="sandbox"?s.select(`[id="${t}"]`):Nt(`[id="${t}"]`);F.insertComputerIcon(n),F.insertDatabaseIcon(n),F.insertClockIcon(n);let h=new be(o);h.setData(b.diagramMarginX,b.diagramMarginX,b.diagramMarginY,b.diagramMarginY),h.data.widthLimit=screen.availWidth,Ut=b.diagramMarginX,Ft=b.diagramMarginY;const f=o.db.getTitle();let d=o.db.getBoundarys("");me(n,"",h,d,o),F.insertArrowHead(n),F.insertArrowEnd(n),F.insertArrowCrossHead(n),F.insertArrowFilledHead(n),C0(n,o.db.getRels(),o.db.getC4Shape,o),h.data.stopx=Ut,h.data.stopy=Ft;const p=h.data;let O=p.stopy-p.starty+2*b.diagramMarginY;const S=p.stopx-p.startx+2*b.diagramMarginX;f&&n.append("text").text(f).attr("x",(p.stopx-p.startx)/2-4*b.diagramMarginX).attr("y",p.starty+b.diagramMarginY),De(n,O,S,b.useMaxWidth);const L=f?60:0;n.attr("viewBox",p.startx-b.diagramMarginX+" -"+(b.diagramMarginY+L)+" "+S+" "+(O+L)),le.debug("models:",p)},he={drawPersonOrSystemArray:xe,drawBoundary:_e,setConf:$t,draw:w0},O0=e=>`.person {
7
+ stroke: ${e.personBorder};
8
+ fill: ${e.personBkg};
9
+ }
10
+ `,T0=O0,S0={parser:Be,db:Jt,renderer:he,styles:T0,init:({c4:e,wrap:t})=>{he.setConf(e),Jt.setWrap(t)}};export{S0 as diagram};
frontend-dist/assets/channel-DsKT-zfZ.js ADDED
@@ -0,0 +1 @@
 
 
1
+ import{aH as o,aI as n}from"./index-BCNM9-Ly.js";const t=(a,r)=>o.lang.round(n.parse(a)[r]);export{t as c};
frontend-dist/assets/classDiagram-beda092f-wmkRqnN2.js ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ import{s as A,d as S,p as G}from"./styles-b4e223ce-CtHeUc7h.js";import{c as v,l as y,d as B,e as W,F as $,A as M,G as I}from"./index-BCNM9-Ly.js";import{G as O}from"./graph-CY8eBbAS.js";import{l as P}from"./layout-CUwpW5wl.js";import{l as X}from"./line-DdWeXrJe.js";import"./array-BKyUJesY.js";import"./path-CbwjOpE9.js";let H=0;const Y=function(i,a,t,o,p){const g=function(e){switch(e){case p.db.relationType.AGGREGATION:return"aggregation";case p.db.relationType.EXTENSION:return"extension";case p.db.relationType.COMPOSITION:return"composition";case p.db.relationType.DEPENDENCY:return"dependency";case p.db.relationType.LOLLIPOP:return"lollipop"}};a.points=a.points.filter(e=>!Number.isNaN(e.y));const s=a.points,c=X().x(function(e){return e.x}).y(function(e){return e.y}).curve($),n=i.append("path").attr("d",c(s)).attr("id","edge"+H).attr("class","relation");let r="";o.arrowMarkerAbsolute&&(r=window.location.protocol+"//"+window.location.host+window.location.pathname+window.location.search,r=r.replace(/\(/g,"\\("),r=r.replace(/\)/g,"\\)")),t.relation.lineType==1&&n.attr("class","relation dashed-line"),t.relation.lineType==10&&n.attr("class","relation dotted-line"),t.relation.type1!=="none"&&n.attr("marker-start","url("+r+"#"+g(t.relation.type1)+"Start)"),t.relation.type2!=="none"&&n.attr("marker-end","url("+r+"#"+g(t.relation.type2)+"End)");let f,h;const x=a.points.length;let b=M.calcLabelPosition(a.points);f=b.x,h=b.y;let u,m,w,k;if(x%2!==0&&x>1){let e=M.calcCardinalityPosition(t.relation.type1!=="none",a.points,a.points[0]),d=M.calcCardinalityPosition(t.relation.type2!=="none",a.points,a.points[x-1]);y.debug("cardinality_1_point "+JSON.stringify(e)),y.debug("cardinality_2_point "+JSON.stringify(d)),u=e.x,m=e.y,w=d.x,k=d.y}if(t.title!==void 0){const e=i.append("g").attr("class","classLabel"),d=e.append("text").attr("class","label").attr("x",f).attr("y",h).attr("fill","red").attr("text-anchor","middle").text(t.title);window.label=d;const 
l=d.node().getBBox();e.insert("rect",":first-child").attr("class","box").attr("x",l.x-o.padding/2).attr("y",l.y-o.padding/2).attr("width",l.width+o.padding).attr("height",l.height+o.padding)}y.info("Rendering relation "+JSON.stringify(t)),t.relationTitle1!==void 0&&t.relationTitle1!=="none"&&i.append("g").attr("class","cardinality").append("text").attr("class","type1").attr("x",u).attr("y",m).attr("fill","black").attr("font-size","6").text(t.relationTitle1),t.relationTitle2!==void 0&&t.relationTitle2!=="none"&&i.append("g").attr("class","cardinality").append("text").attr("class","type2").attr("x",w).attr("y",k).attr("fill","black").attr("font-size","6").text(t.relationTitle2),H++},J=function(i,a,t,o){y.debug("Rendering class ",a,t);const p=a.id,g={id:p,label:a.id,width:0,height:0},s=i.append("g").attr("id",o.db.lookUpDomId(p)).attr("class","classGroup");let c;a.link?c=s.append("svg:a").attr("xlink:href",a.link).attr("target",a.linkTarget).append("text").attr("y",t.textHeight+t.padding).attr("x",0):c=s.append("text").attr("y",t.textHeight+t.padding).attr("x",0);let n=!0;a.annotations.forEach(function(d){const l=c.append("tspan").text("«"+d+"»");n||l.attr("dy",t.textHeight),n=!1});let r=C(a);const f=c.append("tspan").text(r).attr("class","title");n||f.attr("dy",t.textHeight);const h=c.node().getBBox().height;let x,b,u;if(a.members.length>0){x=s.append("line").attr("x1",0).attr("y1",t.padding+h+t.dividerMargin/2).attr("y2",t.padding+h+t.dividerMargin/2);const d=s.append("text").attr("x",t.padding).attr("y",h+t.dividerMargin+t.textHeight).attr("fill","white").attr("class","classText");n=!0,a.members.forEach(function(l){_(d,l,n,t),n=!1}),b=d.node().getBBox()}if(a.methods.length>0){u=s.append("line").attr("x1",0).attr("y1",t.padding+h+t.dividerMargin+b.height).attr("y2",t.padding+h+t.dividerMargin+b.height);const 
d=s.append("text").attr("x",t.padding).attr("y",h+2*t.dividerMargin+b.height+t.textHeight).attr("fill","white").attr("class","classText");n=!0,a.methods.forEach(function(l){_(d,l,n,t),n=!1})}const m=s.node().getBBox();var w=" ";a.cssClasses.length>0&&(w=w+a.cssClasses.join(" "));const e=s.insert("rect",":first-child").attr("x",0).attr("y",0).attr("width",m.width+2*t.padding).attr("height",m.height+t.padding+.5*t.dividerMargin).attr("class",w).node().getBBox().width;return c.node().childNodes.forEach(function(d){d.setAttribute("x",(e-d.getBBox().width)/2)}),a.tooltip&&c.insert("title").text(a.tooltip),x&&x.attr("x2",e),u&&u.attr("x2",e),g.width=e,g.height=m.height+t.padding+.5*t.dividerMargin,g},C=function(i){let a=i.id;return i.type&&(a+="<"+I(i.type)+">"),a},Z=function(i,a,t,o){y.debug("Rendering note ",a,t);const p=a.id,g={id:p,text:a.text,width:0,height:0},s=i.append("g").attr("id",p).attr("class","classGroup");let c=s.append("text").attr("y",t.textHeight+t.padding).attr("x",0);const n=JSON.parse(`"${a.text}"`).split(`
2
+ `);n.forEach(function(x){y.debug(`Adding line: ${x}`),c.append("tspan").text(x).attr("class","title").attr("dy",t.textHeight)});const r=s.node().getBBox(),h=s.insert("rect",":first-child").attr("x",0).attr("y",0).attr("width",r.width+2*t.padding).attr("height",r.height+n.length*t.textHeight+t.padding+.5*t.dividerMargin).node().getBBox().width;return c.node().childNodes.forEach(function(x){x.setAttribute("x",(h-x.getBBox().width)/2)}),g.width=h,g.height=r.height+n.length*t.textHeight+t.padding+.5*t.dividerMargin,g},_=function(i,a,t,o){const{displayText:p,cssStyle:g}=a.getDisplayDetails(),s=i.append("tspan").attr("x",o.padding).text(p);g!==""&&s.attr("style",a.cssStyle),t||s.attr("dy",o.textHeight)},N={getClassTitleString:C,drawClass:J,drawEdge:Y,drawNote:Z};let T={};const E=20,L=function(i){const a=Object.entries(T).find(t=>t[1].label===i);if(a)return a[0]},R=function(i){i.append("defs").append("marker").attr("id","extensionStart").attr("class","extension").attr("refX",0).attr("refY",7).attr("markerWidth",190).attr("markerHeight",240).attr("orient","auto").append("path").attr("d","M 1,7 L18,13 V 1 Z"),i.append("defs").append("marker").attr("id","extensionEnd").attr("refX",19).attr("refY",7).attr("markerWidth",20).attr("markerHeight",28).attr("orient","auto").append("path").attr("d","M 1,1 V 13 L18,7 Z"),i.append("defs").append("marker").attr("id","compositionStart").attr("class","extension").attr("refX",0).attr("refY",7).attr("markerWidth",190).attr("markerHeight",240).attr("orient","auto").append("path").attr("d","M 18,7 L9,13 L1,7 L9,1 Z"),i.append("defs").append("marker").attr("id","compositionEnd").attr("refX",19).attr("refY",7).attr("markerWidth",20).attr("markerHeight",28).attr("orient","auto").append("path").attr("d","M 18,7 L9,13 L1,7 L9,1 
Z"),i.append("defs").append("marker").attr("id","aggregationStart").attr("class","extension").attr("refX",0).attr("refY",7).attr("markerWidth",190).attr("markerHeight",240).attr("orient","auto").append("path").attr("d","M 18,7 L9,13 L1,7 L9,1 Z"),i.append("defs").append("marker").attr("id","aggregationEnd").attr("refX",19).attr("refY",7).attr("markerWidth",20).attr("markerHeight",28).attr("orient","auto").append("path").attr("d","M 18,7 L9,13 L1,7 L9,1 Z"),i.append("defs").append("marker").attr("id","dependencyStart").attr("class","extension").attr("refX",0).attr("refY",7).attr("markerWidth",190).attr("markerHeight",240).attr("orient","auto").append("path").attr("d","M 5,7 L9,13 L1,7 L9,1 Z"),i.append("defs").append("marker").attr("id","dependencyEnd").attr("refX",19).attr("refY",7).attr("markerWidth",20).attr("markerHeight",28).attr("orient","auto").append("path").attr("d","M 18,7 L9,13 L14,7 L9,1 Z")},F=function(i,a,t,o){const p=v().class;T={},y.info("Rendering diagram "+i);const g=v().securityLevel;let s;g==="sandbox"&&(s=B("#i"+a));const c=g==="sandbox"?B(s.nodes()[0].contentDocument.body):B("body"),n=c.select(`[id='${a}']`);R(n);const r=new O({multigraph:!0});r.setGraph({isMultiGraph:!0}),r.setDefaultEdgeLabel(function(){return{}});const f=o.db.getClasses(),h=Object.keys(f);for(const e of h){const d=f[e],l=N.drawClass(n,d,p,o);T[l.id]=l,r.setNode(l.id,l),y.info("Org height: "+l.height)}o.db.getRelations().forEach(function(e){y.info("tjoho"+L(e.id1)+L(e.id2)+JSON.stringify(e)),r.setEdge(L(e.id1),L(e.id2),{relation:e},e.title||"DEFAULT")}),o.db.getNotes().forEach(function(e){y.debug(`Adding note: ${JSON.stringify(e)}`);const d=N.drawNote(n,e,p,o);T[d.id]=d,r.setNode(d.id,d),e.class&&e.class in f&&r.setEdge(e.id,L(e.class),{relation:{id1:e.id,id2:e.class,relation:{type1:"none",type2:"none",lineType:10}}},"DEFAULT")}),P(r),r.nodes().forEach(function(e){e!==void 0&&r.node(e)!==void 0&&(y.debug("Node "+e+": 
"+JSON.stringify(r.node(e))),c.select("#"+(o.db.lookUpDomId(e)||e)).attr("transform","translate("+(r.node(e).x-r.node(e).width/2)+","+(r.node(e).y-r.node(e).height/2)+" )"))}),r.edges().forEach(function(e){e!==void 0&&r.edge(e)!==void 0&&(y.debug("Edge "+e.v+" -> "+e.w+": "+JSON.stringify(r.edge(e))),N.drawEdge(n,r.edge(e),r.edge(e).relation,p,o))});const u=n.node().getBBox(),m=u.width+E*2,w=u.height+E*2;W(n,w,m,p.useMaxWidth);const k=`${u.x-E} ${u.y-E} ${m} ${w}`;y.debug(`viewBox ${k}`),n.attr("viewBox",k)},U={draw:F},tt={parser:G,db:S,renderer:U,styles:A,init:i=>{i.class||(i.class={}),i.class.arrowMarkerAbsolute=i.arrowMarkerAbsolute,S.clear()}};export{tt as diagram};