Nx-Neuralon commited on
Commit
b6d0232
·
verified ·
1 Parent(s): ec43701

Upload 64 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. app/__init__.py +1 -0
  2. app/__pycache__/__init__.cpython-310.pyc +0 -0
  3. app/__pycache__/__init__.cpython-38.pyc +0 -0
  4. app/__pycache__/aggregator.cpython-310.pyc +0 -0
  5. app/__pycache__/aggregator.cpython-38.pyc +0 -0
  6. app/__pycache__/audio_utils.cpython-310.pyc +0 -0
  7. app/__pycache__/config.cpython-310.pyc +0 -0
  8. app/__pycache__/config.cpython-38.pyc +0 -0
  9. app/__pycache__/document_utils.cpython-310.pyc +0 -0
  10. app/__pycache__/evidence_builder.cpython-310.pyc +0 -0
  11. app/__pycache__/file_utils.cpython-310.pyc +0 -0
  12. app/__pycache__/file_utils.cpython-38.pyc +0 -0
  13. app/__pycache__/llm_client.cpython-310.pyc +0 -0
  14. app/__pycache__/llm_client.cpython-38.pyc +0 -0
  15. app/__pycache__/pipeline.cpython-310.pyc +0 -0
  16. app/__pycache__/pipeline.cpython-38.pyc +0 -0
  17. app/__pycache__/prompts.cpython-310.pyc +0 -0
  18. app/__pycache__/prompts_rag.cpython-310.pyc +0 -0
  19. app/__pycache__/rag_client.cpython-310.pyc +0 -0
  20. app/__pycache__/rag_reporter.cpython-310.pyc +0 -0
  21. app/__pycache__/reporter.cpython-310.pyc +0 -0
  22. app/__pycache__/retriever.cpython-310.pyc +0 -0
  23. app/__pycache__/schemas.cpython-310.pyc +0 -0
  24. app/__pycache__/schemas.cpython-38.pyc +0 -0
  25. app/__pycache__/video_payload.cpython-310.pyc +0 -0
  26. app/__pycache__/video_payload.cpython-38.pyc +0 -0
  27. app/__pycache__/video_preprocess.cpython-310.pyc +0 -0
  28. app/agents/__init__.py +1 -0
  29. app/agents/__pycache__/__init__.cpython-310.pyc +0 -0
  30. app/agents/__pycache__/audio_agent.cpython-310.pyc +0 -0
  31. app/agents/__pycache__/base.cpython-310.pyc +0 -0
  32. app/agents/__pycache__/document_agent.cpython-310.pyc +0 -0
  33. app/agents/__pycache__/inappropriate_agent.cpython-310.pyc +0 -0
  34. app/agents/__pycache__/look_agent.cpython-310.pyc +0 -0
  35. app/agents/__pycache__/point_agent.cpython-310.pyc +0 -0
  36. app/agents/__pycache__/respond_agent.cpython-310.pyc +0 -0
  37. app/agents/__pycache__/speak_agent.cpython-310.pyc +0 -0
  38. app/agents/__pycache__/timeline_agent.cpython-310.pyc +0 -0
  39. app/agents/audio_agent.py +209 -0
  40. app/agents/base.py +59 -0
  41. app/agents/document_agent.py +95 -0
  42. app/agents/inappropriate_agent.py +6 -0
  43. app/agents/look_agent.py +6 -0
  44. app/agents/point_agent.py +6 -0
  45. app/agents/respond_agent.py +6 -0
  46. app/agents/speak_agent.py +6 -0
  47. app/agents/timeline_agent.py +6 -0
  48. app/aggregator.py +86 -0
  49. app/audio_utils.py +114 -0
  50. app/config.py +121 -0
app/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # empty
app/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (127 Bytes). View file
 
app/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (125 Bytes). View file
 
app/__pycache__/aggregator.cpython-310.pyc ADDED
Binary file (1.9 kB). View file
 
app/__pycache__/aggregator.cpython-38.pyc ADDED
Binary file (1.87 kB). View file
 
app/__pycache__/audio_utils.cpython-310.pyc ADDED
Binary file (3.37 kB). View file
 
app/__pycache__/config.cpython-310.pyc ADDED
Binary file (4.1 kB). View file
 
app/__pycache__/config.cpython-38.pyc ADDED
Binary file (3.23 kB). View file
 
app/__pycache__/document_utils.cpython-310.pyc ADDED
Binary file (3.06 kB). View file
 
app/__pycache__/evidence_builder.cpython-310.pyc ADDED
Binary file (1.17 kB). View file
 
app/__pycache__/file_utils.cpython-310.pyc ADDED
Binary file (987 Bytes). View file
 
app/__pycache__/file_utils.cpython-38.pyc ADDED
Binary file (899 Bytes). View file
 
app/__pycache__/llm_client.cpython-310.pyc ADDED
Binary file (3.03 kB). View file
 
app/__pycache__/llm_client.cpython-38.pyc ADDED
Binary file (2.89 kB). View file
 
app/__pycache__/pipeline.cpython-310.pyc ADDED
Binary file (16.7 kB). View file
 
app/__pycache__/pipeline.cpython-38.pyc ADDED
Binary file (7.21 kB). View file
 
app/__pycache__/prompts.cpython-310.pyc ADDED
Binary file (3.58 kB). View file
 
app/__pycache__/prompts_rag.cpython-310.pyc ADDED
Binary file (2.89 kB). View file
 
app/__pycache__/rag_client.cpython-310.pyc ADDED
Binary file (2.23 kB). View file
 
app/__pycache__/rag_reporter.cpython-310.pyc ADDED
Binary file (1.27 kB). View file
 
app/__pycache__/reporter.cpython-310.pyc ADDED
Binary file (999 Bytes). View file
 
app/__pycache__/retriever.cpython-310.pyc ADDED
Binary file (5.03 kB). View file
 
app/__pycache__/schemas.cpython-310.pyc ADDED
Binary file (1.7 kB). View file
 
app/__pycache__/schemas.cpython-38.pyc ADDED
Binary file (1.64 kB). View file
 
app/__pycache__/video_payload.cpython-310.pyc ADDED
Binary file (1.58 kB). View file
 
app/__pycache__/video_payload.cpython-38.pyc ADDED
Binary file (1.51 kB). View file
 
app/__pycache__/video_preprocess.cpython-310.pyc ADDED
Binary file (3.72 kB). View file
 
app/agents/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # empty
app/agents/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (134 Bytes). View file
 
app/agents/__pycache__/audio_agent.cpython-310.pyc ADDED
Binary file (6.5 kB). View file
 
app/agents/__pycache__/base.cpython-310.pyc ADDED
Binary file (2.09 kB). View file
 
app/agents/__pycache__/document_agent.cpython-310.pyc ADDED
Binary file (3.11 kB). View file
 
app/agents/__pycache__/inappropriate_agent.cpython-310.pyc ADDED
Binary file (535 Bytes). View file
 
app/agents/__pycache__/look_agent.cpython-310.pyc ADDED
Binary file (499 Bytes). View file
 
app/agents/__pycache__/point_agent.cpython-310.pyc ADDED
Binary file (503 Bytes). View file
 
app/agents/__pycache__/respond_agent.cpython-310.pyc ADDED
Binary file (511 Bytes). View file
 
app/agents/__pycache__/speak_agent.cpython-310.pyc ADDED
Binary file (503 Bytes). View file
 
app/agents/__pycache__/timeline_agent.cpython-310.pyc ADDED
Binary file (515 Bytes). View file
 
app/agents/audio_agent.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from concurrent.futures import ThreadPoolExecutor, as_completed
5
+ from typing import Any
6
+
7
+ from app.llm_client import chat_completion_json, safe_json_loads, ApiKeyPool
8
+ from app.schemas import AgentResult, Finding
9
+ from app.audio_utils import audio_file_to_data_uri, AudioChunk
10
+
11
+
12
# System prompt for the audio agent: frames the LLM as the audio sub-agent of
# an ASD-screening assistant and demands strict-JSON-only output (no markdown,
# no explanations). The text is sent to the model verbatim, so it stays Chinese.
AUDIO_ANALYSIS_SYSTEM = """
你是儿童孤独症谱系障碍辅助筛查系统中的“音频智能体”。
输入是结构化诊断对话视频中提取出的音频转写结果。
你的任务不是做正式诊断,而是从转写内容中提取与“五不”相关的语音/语言证据。
必须只输出严格 JSON,不要输出 markdown,不要解释。
""".strip()


# User-prompt template for transcript analysis. The <<TRANSCRIPT_JSON>>
# placeholder is replaced (via str.replace in AudioAgent.analyze_transcripts)
# with the JSON-serialized list of {start_sec, end_sec, transcript} chunks.
# The embedded JSON skeleton documents the schema the model must return.
AUDIO_ANALYSIS_USER_TEMPLATE = """
请根据以下分段音频转写内容,提取与“五不”相关的证据。

重点关注:
1. 不(少)应:
- 问答中响应不足
- 明显不回应
- 语言互动参与低
2. 不(少)语:
- 主动语言少
- 回答很短
- 交流性语言不足
3. 不当:
- 明显答非所问
- 重复性语言
- 明显异常语用

输出格式:
{
"findings": [
{
"warning_type": "不(少)应|不(少)语|不当",
"start_sec": 12.0,
"end_sec": 30.0,
"confidence": 0.82,
"evidence": "简洁说明",
"behavior_tags": ["标签1", "标签2"],
"clinical_note": "临床解释",
"clip_summary": "一句话摘要",
"modality_limit": "audio_transcript_based"
}
],
"clip_level_summary": "总体摘要"
}

转写输入如下:
<<TRANSCRIPT_JSON>>
""".strip()
58
+
59
+
60
class AudioAgent:
    """Audio-modality agent for the "five nots" screening pipeline.

    Two-stage flow:
      1. Transcribe pre-split audio chunks with an ASR model, either
         serially or in parallel over a rotating API-key pool.
      2. Feed the time-ordered transcripts to a chat model that extracts
         structured findings as strict JSON.

    Not a BaseAgent subclass: its input is audio chunks, not a video payload.
    """

    def __init__(self, model: str, asr_model: str):
        # model: chat model used for transcript analysis.
        # asr_model: speech-to-text model used per audio chunk.
        self.model = model
        self.asr_model = asr_model

    def agent_name(self) -> str:
        """Identifier recorded on every AgentResult this agent emits."""
        return "audio_agent"

    def transcribe_chunk(self, client, chunk: AudioChunk) -> dict[str, Any]:
        """Transcribe one chunk; returns {start_sec, end_sec, transcript}.

        The chunk file is inlined as a base64 data URI, so chunk sizes must
        stay within the API's per-request size/duration limits.
        """
        data_uri = audio_file_to_data_uri(chunk.path, mime_type="audio/mpeg")
        completion = client.chat.completions.create(
            model=self.asr_model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "input_audio",
                            "input_audio": {
                                "data": data_uri
                            }
                        }
                    ]
                }
            ],
            stream=False,
            extra_body={
                # Disable inverse text normalization to keep the raw wording.
                # NOTE(review): assumes the ASR backend honors "asr_options" —
                # confirm against the provider's API docs.
                "asr_options": {
                    "enable_itn": False
                }
            },
            timeout=180,
        )
        # The API may return None content on an empty transcription.
        text = completion.choices[0].message.content or ""
        return {
            "start_sec": chunk.start_sec,
            "end_sec": chunk.end_sec,
            "transcript": text,
        }

    def _transcribe_serial(self, key_pool: ApiKeyPool, chunks: list[AudioChunk]) -> list[dict[str, Any]]:
        """Transcribe chunks one at a time (single-key mode)."""
        transcripts = []
        for chunk in chunks:
            client = key_pool.get_client()
            transcripts.append(self.transcribe_chunk(client, chunk))
        transcripts.sort(key=lambda x: x["start_sec"])
        return transcripts

    def _transcribe_parallel(
        self,
        key_pool: ApiKeyPool,
        chunks: list[AudioChunk],
        max_workers: int,
    ) -> list[dict[str, Any]]:
        """Transcribe chunks concurrently.

        Completion order is arbitrary, so results are re-sorted by
        start_sec to restore timeline order before returning.
        """
        transcripts = []

        def run_one(chunk: AudioChunk):
            # Each task grabs its own client so keys rotate across workers.
            client = key_pool.get_client()
            return self.transcribe_chunk(client, chunk)

        with ThreadPoolExecutor(max_workers=min(max_workers, len(chunks))) as ex:
            futures = [ex.submit(run_one, chunk) for chunk in chunks]
            for fut in as_completed(futures):
                transcripts.append(fut.result())

        transcripts.sort(key=lambda x: x["start_sec"])
        return transcripts

    def transcribe_chunks(
        self,
        key_pool: ApiKeyPool,
        chunks: list[AudioChunk],
        valid_key_count: int,
        max_workers: int,
        log_cb=None,
    ) -> list[dict[str, Any]]:
        """Transcribe all chunks, choosing serial vs parallel mode.

        Serial when at most one valid API key is available (avoids
        rate-limit collisions on a single key); parallel key rotation
        otherwise. log_cb is an optional callable taking a message string.
        """
        log_cb = log_cb or (lambda msg: None)

        if not chunks:
            return []

        if valid_key_count <= 1:
            log_cb("音频 ASR 使用串行模式。")
            return self._transcribe_serial(key_pool, chunks)

        log_cb("音频 ASR 使用并发轮转模式。")
        return self._transcribe_parallel(
            key_pool=key_pool,
            chunks=chunks,
            max_workers=max_workers,
        )

    def analyze_transcripts(self, client, transcripts: list[dict[str, Any]]) -> AgentResult:
        """Run the LLM analysis pass over the combined transcripts.

        Parse failures are swallowed deliberately: the result then carries
        no findings and a summary asking for manual review, so one bad
        model reply does not abort the whole pipeline.
        """
        user_prompt = AUDIO_ANALYSIS_USER_TEMPLATE.replace(
            "<<TRANSCRIPT_JSON>>",
            json.dumps(transcripts, ensure_ascii=False, indent=2),
        )

        raw_text = chat_completion_json(
            client=client,
            model=self.model,
            messages=[
                {"role": "system", "content": AUDIO_ANALYSIS_SYSTEM},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.1,
            timeout=180,
        )

        try:
            payload = safe_json_loads(raw_text)
            findings = [Finding(**f) for f in payload.get("findings", [])]
            clip_level_summary = payload.get("clip_level_summary", "")
        except Exception:
            findings = []
            clip_level_summary = "音频智能体输出解析失败,建议人工复核。"

        # clip_start/end are 0.0: the audio covers the whole session; real
        # timing lives in each finding's start_sec / end_sec.
        return AgentResult(
            agent_name=self.agent_name(),
            clip_start_sec=0.0,
            clip_end_sec=0.0,
            findings=findings,
            clip_level_summary=clip_level_summary,
            raw_text=raw_text,
        )

    def run(
        self,
        key_pool: ApiKeyPool,
        audio_chunks: list[AudioChunk],
        valid_key_count: int,
        max_workers: int,
        log_cb=None,
    ) -> tuple[AgentResult, list[dict[str, Any]]]:
        """End-to-end entry point: transcribe, then analyze.

        Returns the analysis result plus the raw transcripts so callers
        can persist or display them independently.
        """
        log_cb = log_cb or (lambda msg: None)

        transcripts = self.transcribe_chunks(
            key_pool=key_pool,
            chunks=audio_chunks,
            valid_key_count=valid_key_count,
            max_workers=max_workers,
            log_cb=log_cb,
        )

        client = key_pool.get_client()
        result = self.analyze_transcripts(client, transcripts)
        return result, transcripts
207
+
208
+
209
+
app/agents/base.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import Any
5
+
6
+ from app.llm_client import chat_completion_json, safe_json_loads
7
+ from app.schemas import AgentResult, Finding
8
+ from app.prompts import build_agent_prompt
9
+ from app.video_payload import VideoPayload
10
+
11
+
12
class BaseAgent(ABC):
    """Common scaffolding for the video-based "five nots" agents.

    Subclasses supply only agent_name(); that name selects the agent's
    prompt via build_agent_prompt and tags the returned AgentResult.
    """

    def __init__(self, model: str):
        self.model = model

    @abstractmethod
    def agent_name(self) -> str:
        raise NotImplementedError

    def build_messages(self, video_payload: VideoPayload) -> list[dict[str, Any]]:
        """Build the single-turn multimodal request: video first, prompt second."""
        video_part = {
            "type": "video_url",
            "video_url": {
                "url": video_payload.value
            },
            "fps": video_payload.fps,
        }
        text_part = {
            "type": "text",
            "text": build_agent_prompt(self.agent_name()),
        }
        return [{"role": "user", "content": [video_part, text_part]}]

    def run(self, client, video_payload: VideoPayload) -> AgentResult:
        """Call the model and parse its JSON reply into an AgentResult.

        A malformed reply is tolerated: the result then has no findings
        and a summary asking for manual review.
        """
        raw_text = chat_completion_json(
            client=client,
            model=self.model,
            messages=self.build_messages(video_payload),
        )

        try:
            parsed = safe_json_loads(raw_text)
            findings = [Finding(**item) for item in parsed.get("findings", [])]
            clip_level_summary = parsed.get("clip_level_summary", "")
        except Exception:
            findings = []
            clip_level_summary = "模型输出解析失败,建议人工复核。"

        # clip_start_sec / clip_end_sec are fixed at 0: the whole video is fed
        # in one clip, so the authoritative timing is each finding's
        # start_sec / end_sec.
        return AgentResult(
            agent_name=self.agent_name(),
            clip_start_sec=0.0,
            clip_end_sec=0.0,
            findings=findings,
            clip_level_summary=clip_level_summary,
            raw_text=raw_text,
        )
59
+
app/agents/document_agent.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from app.llm_client import chat_completion_json, safe_json_loads
4
+ from app.schemas import AgentResult, Finding
5
+
6
+
7
# System prompt for the document agent: frames the LLM as the document
# sub-agent (case files, screening scales, questionnaires, observation notes)
# and demands strict-JSON-only output. Sent to the model verbatim.
DOCUMENT_AGENT_SYSTEM = """
你是一个儿童孤独症谱系障碍辅助筛查系统中的“文档智能体”。
你的任务是分析患者/家属/医生提供的文档材料,包括:
- 病例资料
- 孤独症检测表/量表
- 问卷
- 观察记录
- 其他结构化或半结构化文档

你不是做正式诊断,而是抽取与“五不”辅助筛查相关的文档证据。
你必须只输出严格 JSON,不要输出 markdown,不要解释。
""".strip()


# User-prompt template. <<DOCUMENT_BUNDLE>> is replaced (str.replace in
# DocumentAgent.run) with the concatenated document text. Documents carry no
# video timeline, so the schema pins start_sec/end_sec to 0 and marks
# findings with modality_limit = "document_only".
DOCUMENT_AGENT_USER_TEMPLATE = """
请分析以下文档内容,并输出结构化 JSON。

要求:
1. 关注“五不”:
- 不(少)看
- 不(少)应
- 不(少)指
- 不(少)语
- 不当
2. 每条 finding 使用以下格式:
{
"warning_type": "不(少)看|不(少)应|不(少)指|不(少)语|不当",
"start_sec": 0,
"end_sec": 0,
"confidence": 0.0,
"evidence": "从文档提炼的关键证据",
"behavior_tags": ["标签1", "标签2"],
"clinical_note": "简短临床解释",
"clip_summary": "一句话摘要",
"modality_limit": "document_only"
}
3. 文档证据没有视频时间戳,因此 start_sec 和 end_sec 固定为 0。
4. 若某一维度没有足够证据,可以不输出。
5. 输出格式必须为:
{
"findings": [...],
"clip_level_summary": "文档总体摘要"
}

文档内容如下:
<<DOCUMENT_BUNDLE>>
""".strip()
54
+
55
+
56
class DocumentAgent:
    """Agent that mines uploaded documents (case files, scales, notes)
    for "five nots" evidence.

    Stand-alone (not a BaseAgent): its input is a text bundle, not video.
    """

    def __init__(self, model: str):
        self.model = model

    def agent_name(self) -> str:
        """Identifier recorded on every AgentResult this agent emits."""
        return "document_agent"

    def run(self, client, document_bundle: str) -> AgentResult:
        """Analyze the document bundle and return an AgentResult.

        Parse failures are tolerated: the result then has no findings and
        a summary asking for manual review.
        """
        user_content = DOCUMENT_AGENT_USER_TEMPLATE.replace(
            "<<DOCUMENT_BUNDLE>>", document_bundle
        )
        raw_text = chat_completion_json(
            client=client,
            model=self.model,
            messages=[
                {"role": "system", "content": DOCUMENT_AGENT_SYSTEM},
                {"role": "user", "content": user_content},
            ],
            temperature=0.1,
            timeout=180,
        )

        try:
            parsed = safe_json_loads(raw_text)
            findings = [Finding(**item) for item in parsed.get("findings", [])]
            summary = parsed.get("clip_level_summary", "")
        except Exception:
            findings = []
            summary = "文档智能体输出解析失败,建议人工复核。"

        # Documents have no video timeline, hence the zero clip bounds.
        return AgentResult(
            agent_name=self.agent_name(),
            findings=findings,
            clip_start_sec=0.0,
            clip_end_sec=0.0,
            clip_level_summary=summary,
            raw_text=raw_text,
        )
app/agents/inappropriate_agent.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from app.agents.base import BaseAgent
2
+
3
+
4
class InappropriateAgent(BaseAgent):
    """Video agent for the "inappropriate behavior" (不当) dimension of the "five nots"."""

    def agent_name(self) -> str:
        # Selects this agent's prompt (via build_agent_prompt) and labels its results.
        return "inappropriate_agent"
app/agents/look_agent.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from app.agents.base import BaseAgent
2
+
3
+
4
class LookAgent(BaseAgent):
    """Video agent for the "little/no eye contact" (不(少)看) dimension of the "five nots"."""

    def agent_name(self) -> str:
        # Selects this agent's prompt (via build_agent_prompt) and labels its results.
        return "look_agent"
app/agents/point_agent.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from app.agents.base import BaseAgent
2
+
3
+
4
class PointAgent(BaseAgent):
    """Video agent for the "little/no pointing" (不(少)指) dimension of the "five nots"."""

    def agent_name(self) -> str:
        # Selects this agent's prompt (via build_agent_prompt) and labels its results.
        return "point_agent"
app/agents/respond_agent.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from app.agents.base import BaseAgent
2
+
3
+
4
class RespondAgent(BaseAgent):
    """Video agent for the "little/no response" (不(少)应) dimension of the "five nots"."""

    def agent_name(self) -> str:
        # Selects this agent's prompt (via build_agent_prompt) and labels its results.
        return "respond_agent"
app/agents/speak_agent.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from app.agents.base import BaseAgent
2
+
3
+
4
class SpeakAgent(BaseAgent):
    """Video agent for the "little/no speech" (不(少)语) dimension of the "five nots"."""

    def agent_name(self) -> str:
        # Selects this agent's prompt (via build_agent_prompt) and labels its results.
        return "speak_agent"
app/agents/timeline_agent.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from app.agents.base import BaseAgent
2
+
3
+
4
class TimelineAgent(BaseAgent):
    """Video agent that reconstructs an overall behavioral timeline of the session."""

    def agent_name(self) -> str:
        # Selects this agent's prompt (via build_agent_prompt) and labels its results.
        return "timeline_agent"
app/aggregator.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from collections import defaultdict
4
+ from app.schemas import AgentResult, MergedEvent
5
+
6
+
7
def _new_group(warning_type, agent_name, f) -> dict:
    """Open a new merge group seeded from a single finding."""
    return {
        "warning_type": warning_type,
        "start_sec": f.start_sec,
        "end_sec": f.end_sec,
        "confidence_sum": f.confidence,
        "count": 1,
        "evidences": [f.evidence],
        "sources": [agent_name],
        "behavior_tags": set(f.behavior_tags),
        "clinical_note": f.clinical_note or "",
    }


def _finalize_group(group: dict) -> MergedEvent:
    """Convert an accumulated group into a MergedEvent (confidence = mean, capped at 1.0)."""
    return MergedEvent(
        warning_type=group["warning_type"],
        start_sec=group["start_sec"],
        end_sec=group["end_sec"],
        confidence=min(1.0, group["confidence_sum"] / group["count"]),
        evidences=group["evidences"],
        sources=group["sources"],
        behavior_tags=sorted(group["behavior_tags"]),
        clinical_note=group["clinical_note"],
    )


def merge_overlapping_results(results: list[AgentResult], iou_gap: float = 2.0) -> list[MergedEvent]:
    """Merge per-agent findings into deduplicated timeline events.

    Findings are grouped by warning_type; within a type, findings whose
    start falls within iou_gap seconds of the running group's end are
    merged (union of tags, longest clinical note, mean confidence).

    Args:
        results: per-agent results whose findings carry start/end seconds.
        iou_gap: max gap (seconds) between events still considered one event.

    Returns:
        MergedEvent list sorted by (start_sec, warning_type).
    """
    grouped = defaultdict(list)
    for result in results:
        for finding in result.findings:
            grouped[finding.warning_type].append((result.agent_name, finding))

    merged: list[MergedEvent] = []

    for warning_type, items in grouped.items():
        items.sort(key=lambda x: x[1].start_sec)

        current = None
        for agent_name, f in items:
            if current is None:
                current = _new_group(warning_type, agent_name, f)
                continue

            if f.start_sec <= current["end_sec"] + iou_gap:
                current["end_sec"] = max(current["end_sec"], f.end_sec)
                current["confidence_sum"] += f.confidence
                current["count"] += 1
                current["evidences"].append(f.evidence)
                current["sources"].append(agent_name)
                current["behavior_tags"].update(f.behavior_tags)
                # Bug fix: clinical_note may be None (the group-open path already
                # defends with `or ""`); the original called len(None) here.
                note = f.clinical_note or ""
                if len(note) > len(current["clinical_note"]):
                    current["clinical_note"] = note
            else:
                merged.append(_finalize_group(current))
                current = _new_group(warning_type, agent_name, f)

        # Flush the trailing group of this warning type.
        if current is not None:
            merged.append(_finalize_group(current))

    merged.sort(key=lambda x: (x.start_sec, x.warning_type))
    return merged
app/audio_utils.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import base64
4
+ import os
5
+ import shutil
6
+ import subprocess
7
+ from dataclasses import dataclass
8
+ from typing import List
9
+
10
+
11
@dataclass
class AudioChunk:
    """One fixed-length slice of the extracted audio track."""

    # Filesystem path of the chunk file (mp3).
    path: str
    # Offset of the chunk start within the full audio, in seconds.
    start_sec: float
    # Offset of the chunk end within the full audio, in seconds.
    end_sec: float
16
+
17
+
18
def check_ffmpeg_available() -> bool:
    """Return True when both ffmpeg and ffprobe are found on PATH."""
    required_tools = ("ffmpeg", "ffprobe")
    return all(shutil.which(tool) is not None for tool in required_tools)
20
+
21
+
22
def ensure_dir(path: str) -> None:
    """Create *path* (including parents) if missing; no-op when it already exists."""
    os.makedirs(path, exist_ok=True)
24
+
25
+
26
def get_media_duration(path: str) -> float:
    """Return the media file's duration in seconds, as reported by ffprobe.

    Raises RuntimeError when ffprobe exits with a non-zero status.
    """
    probe_cmd = [
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        path,
    ]
    result = subprocess.run(
        probe_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    if result.returncode != 0:
        raise RuntimeError(f"ffprobe 获取时长失败: {result.stderr}")
    return float(result.stdout.strip())
38
+
39
+
40
def extract_audio_from_video(
    video_path: str,
    output_audio_path: str,
    bitrate: str = "64k",
) -> str:
    """
    Extract the audio track from a video as 16 kHz mono mp3 for later ASR.

    Args:
        video_path: source video file; must exist.
        output_audio_path: destination mp3 path; parent dirs are created.
        bitrate: target audio bitrate passed to ffmpeg's -b:a.

    Returns:
        output_audio_path, for call chaining.

    Raises:
        FileNotFoundError: video_path does not exist.
        RuntimeError: ffmpeg/ffprobe missing, or ffmpeg exits non-zero.
    """
    if not os.path.exists(video_path):
        raise FileNotFoundError(f"视频不存在: {video_path}")
    if not check_ffmpeg_available():
        raise RuntimeError("未检测到 ffmpeg/ffprobe,请先安装 ffmpeg。")

    # Bug fix: dirname is "" when output_audio_path is a bare filename, and
    # os.makedirs("") raises FileNotFoundError — only create a real directory.
    out_dir = os.path.dirname(output_audio_path)
    if out_dir:
        ensure_dir(out_dir)

    cmd = [
        "ffmpeg",
        "-y",
        "-i", video_path,
        "-vn",            # drop the video stream
        "-ac", "1",       # mono
        "-ar", "16000",   # 16 kHz, typical ASR input rate
        "-c:a", "mp3",
        "-b:a", bitrate,
        output_audio_path,
    ]
    proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    if proc.returncode != 0:
        raise RuntimeError(f"抽取音频失败:\n{proc.stderr}")
    return output_audio_path
70
+
71
+
72
def split_audio_to_chunks(
    audio_path: str,
    output_dir: str,
    chunk_seconds: int = 290,
) -> List[AudioChunk]:
    """
    Split audio into fixed-length chunks so each request stays under the
    qwen3-asr-flash per-call duration limit.

    Args:
        audio_path: source audio file (any ffmpeg-readable format).
        output_dir: directory for the chunk files; created if missing.
        chunk_seconds: nominal chunk length; the final chunk may be shorter.

    Returns:
        Chunks in timeline order, each carrying its [start, end) offsets in
        seconds relative to the source audio.

    Raises:
        ValueError: chunk_seconds is not positive.
        RuntimeError: ffmpeg fails on any chunk.
    """
    # Bug fix: with chunk_seconds <= 0, `start` never advances past 0 and the
    # loop below never terminates. Fail fast instead.
    if chunk_seconds <= 0:
        raise ValueError(f"chunk_seconds 必须为正数: {chunk_seconds}")

    ensure_dir(output_dir)
    duration = get_media_duration(audio_path)
    chunks: List[AudioChunk] = []

    start = 0.0
    idx = 0
    while start < duration:
        end = min(duration, start + chunk_seconds)
        chunk_path = os.path.join(output_dir, f"audio_chunk_{idx:03d}.mp3")

        # Stream copy (-acodec copy): fast, no re-encode. NOTE(review): cut
        # points then snap to mp3 frame boundaries — assumed acceptable for ASR.
        cmd = [
            "ffmpeg",
            "-y",
            "-i", audio_path,
            "-ss", str(start),
            "-t", str(end - start),
            "-acodec", "copy",
            chunk_path,
        ]
        proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        if proc.returncode != 0:
            raise RuntimeError(f"切分音频失败:\n{proc.stderr}")

        chunks.append(AudioChunk(path=chunk_path, start_sec=start, end_sec=end))
        start = end
        idx += 1

    return chunks
108
+
109
+
110
def audio_file_to_data_uri(audio_path: str, mime_type: str = "audio/mpeg") -> str:
    """Read an audio file and encode it as a base64 data URI for inline upload."""
    with open(audio_path, "rb") as audio_file:
        encoded = base64.b64encode(audio_file.read()).decode("utf-8")
    return f"data:{mime_type};base64,{encoded}"
114
+
app/config.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from dataclasses import dataclass
5
+ from dotenv import load_dotenv
6
+
7
+ load_dotenv()
8
+
9
+
10
def _clean_env_value(value: str) -> str:
    """Normalize a raw env-var value: trim whitespace and one pair of matching quotes.

    Accepts None (returns ""). Only a matching pair of double or single
    quotes is stripped, then the inner text is trimmed again.
    """
    value = (value or "").strip()
    # Bug fix: require len >= 2 so a lone quote character is returned as-is;
    # previously '"' passed both startswith and endswith and value[1:-1]
    # silently stripped it to "".
    if len(value) >= 2 and value[0] == value[-1] and value[0] in {'"', "'"}:
        value = value[1:-1].strip()
    return value
15
+
16
+
17
def _to_bool(value: str, default: bool = False) -> bool:
    """Parse a truthy env-var string; None falls back to *default*.

    Recognized truthy spellings (case-insensitive, after quote/whitespace
    cleanup): 1, true, yes, y, on. Everything else is False.
    """
    if value is None:
        return default
    normalized = _clean_env_value(value).lower()
    truthy = {"1", "true", "yes", "y", "on"}
    return normalized in truthy
22
+
23
+
24
@dataclass
class Settings:
    """Application configuration resolved from environment variables.

    A .env file is loaded at module import via python-dotenv, so values may
    come from either the process environment or .env. Construct via
    Settings.load() rather than directly.
    """

    # --- DashScope / model access ---
    api_keys: list[str]            # one or more API keys (comma-separated in env)
    base_url: str                  # OpenAI-compatible endpoint base URL
    model: str                     # multimodal chat model name
    max_workers: int               # thread-pool size for parallel calls
    video_input_mode: str          # "base64" or "remote_url"
    video_mime_type: str           # MIME type sent with base64 video
    video_fps: int                 # frame-sampling rate hint for the model
    output_dir: str                # where outputs/reports are written

    enable_rag_final: bool         # gate for the Bailian RAG final-report step

    # --- Alibaba Cloud Bailian retrieval (only required when RAG enabled) ---
    alibaba_cloud_access_key_id: str
    alibaba_cloud_access_key_secret: str
    bailian_workspace_id: str
    bailian_index_id: str
    bailian_retrieve_topn: int
    bailian_retrieve_enable_rerank: bool
    bailian_retrieve_dense_topk: int
    bailian_retrieve_sparse_topk: int
    bailian_retrieve_min_score: float

    # --- Video preprocessing ---
    enable_video_preprocess: bool
    video_preprocess_mode: str
    video_preprocess_remove_audio: bool
    preprocessed_video_dir: str

    # --- Audio agent (ASR) ---
    enable_audio_agent: bool
    audio_asr_model: str
    audio_chunk_seconds: int       # chunk length for ASR requests
    extracted_audio_dir: str

    @staticmethod
    def load() -> "Settings":
        """Build a Settings instance from the environment.

        Raises:
            ValueError: no API key configured, invalid VIDEO_INPUT_MODE, or
                RAG enabled without its required Bailian credentials.
        """
        # DASHSCOPE_API_KEYS is a comma-separated list; blanks are dropped.
        raw_keys = _clean_env_value(os.getenv("DASHSCOPE_API_KEYS", ""))
        api_keys = []
        for item in raw_keys.split(","):
            k = _clean_env_value(item)
            if k:
                api_keys.append(k)

        if not api_keys:
            raise ValueError("DASHSCOPE_API_KEYS 为空,请在 .env 中配置至少一个有效 API Key。")

        video_input_mode = _clean_env_value(os.getenv("VIDEO_INPUT_MODE", "base64")).lower()
        if video_input_mode not in {"base64", "remote_url"}:
            raise ValueError("VIDEO_INPUT_MODE 只能是 base64 或 remote_url。")

        enable_rag_final = _to_bool(os.getenv("ENABLE_RAG_FINAL", "false"))

        settings = Settings(
            api_keys=api_keys,
            base_url=_clean_env_value(os.getenv("DASHSCOPE_BASE_URL", "https://dashscope.aliyuncs.com/compatible-mode/v1")),
            model=_clean_env_value(os.getenv("QWEN_MODEL", "qwen3.5-plus")),
            max_workers=int(_clean_env_value(os.getenv("MAX_WORKERS", "6"))),
            video_input_mode=video_input_mode,
            video_mime_type=_clean_env_value(os.getenv("VIDEO_MIME_TYPE", "video/mp4")),
            video_fps=int(_clean_env_value(os.getenv("VIDEO_FPS", "2"))),
            output_dir=_clean_env_value(os.getenv("OUTPUT_DIR", "outputs")),
            enable_rag_final=enable_rag_final,
            alibaba_cloud_access_key_id=_clean_env_value(os.getenv("ALIBABA_CLOUD_ACCESS_KEY_ID", "")),
            alibaba_cloud_access_key_secret=_clean_env_value(os.getenv("ALIBABA_CLOUD_ACCESS_KEY_SECRET", "")),
            bailian_workspace_id=_clean_env_value(os.getenv("BAILIAN_WORKSPACE_ID", "")),
            bailian_index_id=_clean_env_value(os.getenv("BAILIAN_INDEX_ID", "")),
            bailian_retrieve_topn=int(_clean_env_value(os.getenv("BAILIAN_RETRIEVE_TOPN", "6"))),
            bailian_retrieve_enable_rerank=_to_bool(os.getenv("BAILIAN_RETRIEVE_ENABLE_RERANK", "true")),
            bailian_retrieve_dense_topk=int(_clean_env_value(os.getenv("BAILIAN_RETRIEVE_DENSE_TOPK", "20"))),
            bailian_retrieve_sparse_topk=int(_clean_env_value(os.getenv("BAILIAN_RETRIEVE_SPARSE_TOPK", "20"))),
            bailian_retrieve_min_score=float(_clean_env_value(os.getenv("BAILIAN_RETRIEVE_MIN_SCORE", "0.15"))),
            enable_video_preprocess=_to_bool(os.getenv("ENABLE_VIDEO_PREPROCESS", "true")),
            video_preprocess_mode=_clean_env_value(os.getenv("VIDEO_PREPROCESS_MODE", "analysis")),
            video_preprocess_remove_audio=_to_bool(os.getenv("VIDEO_PREPROCESS_REMOVE_AUDIO", "false")),
            preprocessed_video_dir=_clean_env_value(os.getenv("PREPROCESSED_VIDEO_DIR", "preprocessed_videos")),

            # Audio agent settings.
            enable_audio_agent=_to_bool(os.getenv("ENABLE_AUDIO_AGENT", "true")),
            audio_asr_model=_clean_env_value(os.getenv("AUDIO_ASR_MODEL", "qwen3-asr-flash")),
            audio_chunk_seconds=int(_clean_env_value(os.getenv("AUDIO_CHUNK_SECONDS", "290"))),
            extracted_audio_dir=_clean_env_value(os.getenv("EXTRACTED_AUDIO_DIR", "extracted_audio")),
        )

        # RAG needs Bailian credentials; fail fast, naming every missing var.
        if settings.enable_rag_final:
            missing = []
            if not settings.alibaba_cloud_access_key_id:
                missing.append("ALIBABA_CLOUD_ACCESS_KEY_ID")
            if not settings.alibaba_cloud_access_key_secret:
                missing.append("ALIBABA_CLOUD_ACCESS_KEY_SECRET")
            if not settings.bailian_workspace_id:
                missing.append("BAILIAN_WORKSPACE_ID")
            if not settings.bailian_index_id:
                missing.append("BAILIAN_INDEX_ID")
            if missing:
                raise ValueError(
                    "ENABLE_RAG_FINAL=true 时,以下环境变量必须配置:" + ", ".join(missing)
                )

        return settings