Nx-Neuralon commited on
Commit
b6d0232
·
verified ·
1 Parent(s): ec43701

Upload 64 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. app/__init__.py +1 -0
  2. app/__pycache__/__init__.cpython-310.pyc +0 -0
  3. app/__pycache__/__init__.cpython-38.pyc +0 -0
  4. app/__pycache__/aggregator.cpython-310.pyc +0 -0
  5. app/__pycache__/aggregator.cpython-38.pyc +0 -0
  6. app/__pycache__/audio_utils.cpython-310.pyc +0 -0
  7. app/__pycache__/config.cpython-310.pyc +0 -0
  8. app/__pycache__/config.cpython-38.pyc +0 -0
  9. app/__pycache__/document_utils.cpython-310.pyc +0 -0
  10. app/__pycache__/evidence_builder.cpython-310.pyc +0 -0
  11. app/__pycache__/file_utils.cpython-310.pyc +0 -0
  12. app/__pycache__/file_utils.cpython-38.pyc +0 -0
  13. app/__pycache__/llm_client.cpython-310.pyc +0 -0
  14. app/__pycache__/llm_client.cpython-38.pyc +0 -0
  15. app/__pycache__/pipeline.cpython-310.pyc +0 -0
  16. app/__pycache__/pipeline.cpython-38.pyc +0 -0
  17. app/__pycache__/prompts.cpython-310.pyc +0 -0
  18. app/__pycache__/prompts_rag.cpython-310.pyc +0 -0
  19. app/__pycache__/rag_client.cpython-310.pyc +0 -0
  20. app/__pycache__/rag_reporter.cpython-310.pyc +0 -0
  21. app/__pycache__/reporter.cpython-310.pyc +0 -0
  22. app/__pycache__/retriever.cpython-310.pyc +0 -0
  23. app/__pycache__/schemas.cpython-310.pyc +0 -0
  24. app/__pycache__/schemas.cpython-38.pyc +0 -0
  25. app/__pycache__/video_payload.cpython-310.pyc +0 -0
  26. app/__pycache__/video_payload.cpython-38.pyc +0 -0
  27. app/__pycache__/video_preprocess.cpython-310.pyc +0 -0
  28. app/agents/__init__.py +1 -0
  29. app/agents/__pycache__/__init__.cpython-310.pyc +0 -0
  30. app/agents/__pycache__/audio_agent.cpython-310.pyc +0 -0
  31. app/agents/__pycache__/base.cpython-310.pyc +0 -0
  32. app/agents/__pycache__/document_agent.cpython-310.pyc +0 -0
  33. app/agents/__pycache__/inappropriate_agent.cpython-310.pyc +0 -0
  34. app/agents/__pycache__/look_agent.cpython-310.pyc +0 -0
  35. app/agents/__pycache__/point_agent.cpython-310.pyc +0 -0
  36. app/agents/__pycache__/respond_agent.cpython-310.pyc +0 -0
  37. app/agents/__pycache__/speak_agent.cpython-310.pyc +0 -0
  38. app/agents/__pycache__/timeline_agent.cpython-310.pyc +0 -0
  39. app/agents/audio_agent.py +209 -0
  40. app/agents/base.py +59 -0
  41. app/agents/document_agent.py +95 -0
  42. app/agents/inappropriate_agent.py +6 -0
  43. app/agents/look_agent.py +6 -0
  44. app/agents/point_agent.py +6 -0
  45. app/agents/respond_agent.py +6 -0
  46. app/agents/speak_agent.py +6 -0
  47. app/agents/timeline_agent.py +6 -0
  48. app/aggregator.py +86 -0
  49. app/audio_utils.py +114 -0
  50. app/config.py +121 -0
app/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # empty
app/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (127 Bytes). View file
 
app/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (125 Bytes). View file
 
app/__pycache__/aggregator.cpython-310.pyc ADDED
Binary file (1.9 kB). View file
 
app/__pycache__/aggregator.cpython-38.pyc ADDED
Binary file (1.87 kB). View file
 
app/__pycache__/audio_utils.cpython-310.pyc ADDED
Binary file (3.37 kB). View file
 
app/__pycache__/config.cpython-310.pyc ADDED
Binary file (4.1 kB). View file
 
app/__pycache__/config.cpython-38.pyc ADDED
Binary file (3.23 kB). View file
 
app/__pycache__/document_utils.cpython-310.pyc ADDED
Binary file (3.06 kB). View file
 
app/__pycache__/evidence_builder.cpython-310.pyc ADDED
Binary file (1.17 kB). View file
 
app/__pycache__/file_utils.cpython-310.pyc ADDED
Binary file (987 Bytes). View file
 
app/__pycache__/file_utils.cpython-38.pyc ADDED
Binary file (899 Bytes). View file
 
app/__pycache__/llm_client.cpython-310.pyc ADDED
Binary file (3.03 kB). View file
 
app/__pycache__/llm_client.cpython-38.pyc ADDED
Binary file (2.89 kB). View file
 
app/__pycache__/pipeline.cpython-310.pyc ADDED
Binary file (16.7 kB). View file
 
app/__pycache__/pipeline.cpython-38.pyc ADDED
Binary file (7.21 kB). View file
 
app/__pycache__/prompts.cpython-310.pyc ADDED
Binary file (3.58 kB). View file
 
app/__pycache__/prompts_rag.cpython-310.pyc ADDED
Binary file (2.89 kB). View file
 
app/__pycache__/rag_client.cpython-310.pyc ADDED
Binary file (2.23 kB). View file
 
app/__pycache__/rag_reporter.cpython-310.pyc ADDED
Binary file (1.27 kB). View file
 
app/__pycache__/reporter.cpython-310.pyc ADDED
Binary file (999 Bytes). View file
 
app/__pycache__/retriever.cpython-310.pyc ADDED
Binary file (5.03 kB). View file
 
app/__pycache__/schemas.cpython-310.pyc ADDED
Binary file (1.7 kB). View file
 
app/__pycache__/schemas.cpython-38.pyc ADDED
Binary file (1.64 kB). View file
 
app/__pycache__/video_payload.cpython-310.pyc ADDED
Binary file (1.58 kB). View file
 
app/__pycache__/video_payload.cpython-38.pyc ADDED
Binary file (1.51 kB). View file
 
app/__pycache__/video_preprocess.cpython-310.pyc ADDED
Binary file (3.72 kB). View file
 
app/agents/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # empty
app/agents/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (134 Bytes). View file
 
app/agents/__pycache__/audio_agent.cpython-310.pyc ADDED
Binary file (6.5 kB). View file
 
app/agents/__pycache__/base.cpython-310.pyc ADDED
Binary file (2.09 kB). View file
 
app/agents/__pycache__/document_agent.cpython-310.pyc ADDED
Binary file (3.11 kB). View file
 
app/agents/__pycache__/inappropriate_agent.cpython-310.pyc ADDED
Binary file (535 Bytes). View file
 
app/agents/__pycache__/look_agent.cpython-310.pyc ADDED
Binary file (499 Bytes). View file
 
app/agents/__pycache__/point_agent.cpython-310.pyc ADDED
Binary file (503 Bytes). View file
 
app/agents/__pycache__/respond_agent.cpython-310.pyc ADDED
Binary file (511 Bytes). View file
 
app/agents/__pycache__/speak_agent.cpython-310.pyc ADDED
Binary file (503 Bytes). View file
 
app/agents/__pycache__/timeline_agent.cpython-310.pyc ADDED
Binary file (515 Bytes). View file
 
app/agents/audio_agent.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from concurrent.futures import ThreadPoolExecutor, as_completed
5
+ from typing import Any
6
+
7
+ from app.llm_client import chat_completion_json, safe_json_loads, ApiKeyPool
8
+ from app.schemas import AgentResult, Finding
9
+ from app.audio_utils import audio_file_to_data_uri, AudioChunk
10
+
11
+
12
# System prompt for the audio agent: frames the LLM as the audio sub-agent of
# an ASD-screening assistant and demands strict-JSON-only output (no markdown,
# no explanations). The text is sent to the model verbatim, so it stays Chinese.
AUDIO_ANALYSIS_SYSTEM = """
你是儿童孤独症谱系障碍辅助筛查系统中的“音频智能体”。
输入是结构化诊断对话视频中提取出的音频转写结果。
你的任务不是做正式诊断,而是从转写内容中提取与“五不”相关的语音/语言证据。
必须只输出严格 JSON,不要输出 markdown,不要解释。
""".strip()


# User-prompt template for transcript analysis. The <<TRANSCRIPT_JSON>>
# placeholder is replaced (via str.replace in AudioAgent.analyze_transcripts)
# with the JSON-serialized list of {start_sec, end_sec, transcript} chunks.
# The embedded JSON skeleton documents the schema the model must return.
AUDIO_ANALYSIS_USER_TEMPLATE = """
请根据以下分段音频转写内容,提取与“五不”相关的证据。

重点关注:
1. 不(少)应:
- 问答中响应不足
- 明显不回应
- 语言互动参与低
2. 不(少)语:
- 主动语言少
- 回答很短
- 交流性语言不足
3. 不当:
- 明显答非所问
- 重复性语言
- 明显异常语用

输出格式:
{
"findings": [
{
"warning_type": "不(少)应|不(少)语|不当",
"start_sec": 12.0,
"end_sec": 30.0,
"confidence": 0.82,
"evidence": "简洁说明",
"behavior_tags": ["标签1", "标签2"],
"clinical_note": "临床解释",
"clip_summary": "一句话摘要",
"modality_limit": "audio_transcript_based"
}
],
"clip_level_summary": "总体摘要"
}

转写输入如下:
<<TRANSCRIPT_JSON>>
""".strip()
58
+
59
+
60
class AudioAgent:
    """Audio-modality agent for the "five nots" screening pipeline.

    Two-stage flow:
      1. Transcribe pre-split audio chunks with an ASR model, either
         serially or in parallel over a rotating API-key pool.
      2. Feed the time-ordered transcripts to a chat model that extracts
         structured findings as strict JSON.

    Not a BaseAgent subclass: its input is audio chunks, not a video payload.
    """

    def __init__(self, model: str, asr_model: str):
        # model: chat model used for transcript analysis.
        # asr_model: speech-to-text model used per audio chunk.
        self.model = model
        self.asr_model = asr_model

    def agent_name(self) -> str:
        """Identifier recorded on every AgentResult this agent emits."""
        return "audio_agent"

    def transcribe_chunk(self, client, chunk: AudioChunk) -> dict[str, Any]:
        """Transcribe one chunk; returns {start_sec, end_sec, transcript}.

        The chunk file is inlined as a base64 data URI, so chunk sizes must
        stay within the API's per-request size/duration limits.
        """
        data_uri = audio_file_to_data_uri(chunk.path, mime_type="audio/mpeg")
        completion = client.chat.completions.create(
            model=self.asr_model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "input_audio",
                            "input_audio": {
                                "data": data_uri
                            }
                        }
                    ]
                }
            ],
            stream=False,
            extra_body={
                # Disable inverse text normalization to keep the raw wording.
                # NOTE(review): assumes the ASR backend honors "asr_options" —
                # confirm against the provider's API docs.
                "asr_options": {
                    "enable_itn": False
                }
            },
            timeout=180,
        )
        # The API may return None content on an empty transcription.
        text = completion.choices[0].message.content or ""
        return {
            "start_sec": chunk.start_sec,
            "end_sec": chunk.end_sec,
            "transcript": text,
        }

    def _transcribe_serial(self, key_pool: ApiKeyPool, chunks: list[AudioChunk]) -> list[dict[str, Any]]:
        """Transcribe chunks one at a time (single-key mode)."""
        transcripts = []
        for chunk in chunks:
            client = key_pool.get_client()
            transcripts.append(self.transcribe_chunk(client, chunk))
        transcripts.sort(key=lambda x: x["start_sec"])
        return transcripts

    def _transcribe_parallel(
        self,
        key_pool: ApiKeyPool,
        chunks: list[AudioChunk],
        max_workers: int,
    ) -> list[dict[str, Any]]:
        """Transcribe chunks concurrently.

        Completion order is arbitrary, so results are re-sorted by
        start_sec to restore timeline order before returning.
        """
        transcripts = []

        def run_one(chunk: AudioChunk):
            # Each task grabs its own client so keys rotate across workers.
            client = key_pool.get_client()
            return self.transcribe_chunk(client, chunk)

        with ThreadPoolExecutor(max_workers=min(max_workers, len(chunks))) as ex:
            futures = [ex.submit(run_one, chunk) for chunk in chunks]
            for fut in as_completed(futures):
                transcripts.append(fut.result())

        transcripts.sort(key=lambda x: x["start_sec"])
        return transcripts

    def transcribe_chunks(
        self,
        key_pool: ApiKeyPool,
        chunks: list[AudioChunk],
        valid_key_count: int,
        max_workers: int,
        log_cb=None,
    ) -> list[dict[str, Any]]:
        """Transcribe all chunks, choosing serial vs parallel mode.

        Serial when at most one valid API key is available (avoids
        rate-limit collisions on a single key); parallel key rotation
        otherwise. log_cb is an optional callable taking a message string.
        """
        log_cb = log_cb or (lambda msg: None)

        if not chunks:
            return []

        if valid_key_count <= 1:
            log_cb("音频 ASR 使用串行模式。")
            return self._transcribe_serial(key_pool, chunks)

        log_cb("音频 ASR 使用并发轮转模式。")
        return self._transcribe_parallel(
            key_pool=key_pool,
            chunks=chunks,
            max_workers=max_workers,
        )

    def analyze_transcripts(self, client, transcripts: list[dict[str, Any]]) -> AgentResult:
        """Run the LLM analysis pass over the combined transcripts.

        Parse failures are swallowed deliberately: the result then carries
        no findings and a summary asking for manual review, so one bad
        model reply does not abort the whole pipeline.
        """
        user_prompt = AUDIO_ANALYSIS_USER_TEMPLATE.replace(
            "<<TRANSCRIPT_JSON>>",
            json.dumps(transcripts, ensure_ascii=False, indent=2),
        )

        raw_text = chat_completion_json(
            client=client,
            model=self.model,
            messages=[
                {"role": "system", "content": AUDIO_ANALYSIS_SYSTEM},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.1,
            timeout=180,
        )

        try:
            payload = safe_json_loads(raw_text)
            findings = [Finding(**f) for f in payload.get("findings", [])]
            clip_level_summary = payload.get("clip_level_summary", "")
        except Exception:
            findings = []
            clip_level_summary = "音频智能体输出解析失败,建议人工复核。"

        # clip_start/end are 0.0: the audio covers the whole session; real
        # timing lives in each finding's start_sec / end_sec.
        return AgentResult(
            agent_name=self.agent_name(),
            clip_start_sec=0.0,
            clip_end_sec=0.0,
            findings=findings,
            clip_level_summary=clip_level_summary,
            raw_text=raw_text,
        )

    def run(
        self,
        key_pool: ApiKeyPool,
        audio_chunks: list[AudioChunk],
        valid_key_count: int,
        max_workers: int,
        log_cb=None,
    ) -> tuple[AgentResult, list[dict[str, Any]]]:
        """End-to-end entry point: transcribe, then analyze.

        Returns the analysis result plus the raw transcripts so callers
        can persist or display them independently.
        """
        log_cb = log_cb or (lambda msg: None)

        transcripts = self.transcribe_chunks(
            key_pool=key_pool,
            chunks=audio_chunks,
            valid_key_count=valid_key_count,
            max_workers=max_workers,
            log_cb=log_cb,
        )

        client = key_pool.get_client()
        result = self.analyze_transcripts(client, transcripts)
        return result, transcripts
207
+
208
+
209
+
app/agents/base.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import Any
5
+
6
+ from app.llm_client import chat_completion_json, safe_json_loads
7
+ from app.schemas import AgentResult, Finding
8
+ from app.prompts import build_agent_prompt
9
+ from app.video_payload import VideoPayload
10
+
11
+
12
class BaseAgent(ABC):
    """Common scaffolding for the video-based "five nots" agents.

    Subclasses supply only agent_name(); that name selects the agent's
    prompt via build_agent_prompt and tags the returned AgentResult.
    """

    def __init__(self, model: str):
        self.model = model

    @abstractmethod
    def agent_name(self) -> str:
        raise NotImplementedError

    def build_messages(self, video_payload: VideoPayload) -> list[dict[str, Any]]:
        """Build the single-turn multimodal request: video first, prompt second."""
        video_part = {
            "type": "video_url",
            "video_url": {
                "url": video_payload.value
            },
            "fps": video_payload.fps,
        }
        text_part = {
            "type": "text",
            "text": build_agent_prompt(self.agent_name()),
        }
        return [{"role": "user", "content": [video_part, text_part]}]

    def run(self, client, video_payload: VideoPayload) -> AgentResult:
        """Call the model and parse its JSON reply into an AgentResult.

        A malformed reply is tolerated: the result then has no findings
        and a summary asking for manual review.
        """
        raw_text = chat_completion_json(
            client=client,
            model=self.model,
            messages=self.build_messages(video_payload),
        )

        try:
            parsed = safe_json_loads(raw_text)
            findings = [Finding(**item) for item in parsed.get("findings", [])]
            clip_level_summary = parsed.get("clip_level_summary", "")
        except Exception:
            findings = []
            clip_level_summary = "模型输出解析失败,建议人工复核。"

        # clip_start_sec / clip_end_sec are fixed at 0: the whole video is fed
        # in one clip, so the authoritative timing is each finding's
        # start_sec / end_sec.
        return AgentResult(
            agent_name=self.agent_name(),
            clip_start_sec=0.0,
            clip_end_sec=0.0,
            findings=findings,
            clip_level_summary=clip_level_summary,
            raw_text=raw_text,
        )
59
+
app/agents/document_agent.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from app.llm_client import chat_completion_json, safe_json_loads
4
+ from app.schemas import AgentResult, Finding
5
+
6
+
7
# System prompt for the document agent: frames the LLM as the document
# sub-agent (case files, screening scales, questionnaires, observation notes)
# and demands strict-JSON-only output. Sent to the model verbatim.
DOCUMENT_AGENT_SYSTEM = """
你是一个儿童孤独症谱系障碍辅助筛查系统中的“文档智能体”。
你的任务是分析患者/家属/医生提供的文档材料,包括:
- 病例资料
- 孤独症检测表/量表
- 问卷
- 观察记录
- 其他结构化或半结构化文档

你不是做正式诊断,而是抽取与“五不”辅助筛查相关的文档证据。
你必须只输出严格 JSON,不要输出 markdown,不要解释。
""".strip()


# User-prompt template. <<DOCUMENT_BUNDLE>> is replaced (str.replace in
# DocumentAgent.run) with the concatenated document text. Documents carry no
# video timeline, so the schema pins start_sec/end_sec to 0 and marks
# findings with modality_limit = "document_only".
DOCUMENT_AGENT_USER_TEMPLATE = """
请分析以下文档内容,并输出结构化 JSON。

要求:
1. 关注“五不”:
- 不(少)看
- 不(少)应
- 不(少)指
- 不(少)语
- 不当
2. 每条 finding 使用以下格式:
{
"warning_type": "不(少)看|不(少)应|不(少)指|不(少)语|不当",
"start_sec": 0,
"end_sec": 0,
"confidence": 0.0,
"evidence": "从文档提炼的关键证据",
"behavior_tags": ["标签1", "标签2"],
"clinical_note": "简短临床解释",
"clip_summary": "一句话摘要",
"modality_limit": "document_only"
}
3. 文档证据没有视频时间戳,因此 start_sec 和 end_sec 固定为 0。
4. 若某一维度没有足够证据,可以不输出。
5. 输出格式必须为:
{
"findings": [...],
"clip_level_summary": "文档总体摘要"
}

文档内容如下:
<<DOCUMENT_BUNDLE>>
""".strip()
54
+
55
+
56
class DocumentAgent:
    """Agent that mines uploaded documents (case files, scales, notes)
    for "five nots" evidence.

    Stand-alone (not a BaseAgent): its input is a text bundle, not video.
    """

    def __init__(self, model: str):
        self.model = model

    def agent_name(self) -> str:
        """Identifier recorded on every AgentResult this agent emits."""
        return "document_agent"

    def run(self, client, document_bundle: str) -> AgentResult:
        """Analyze the document bundle and return an AgentResult.

        Parse failures are tolerated: the result then has no findings and
        a summary asking for manual review.
        """
        user_content = DOCUMENT_AGENT_USER_TEMPLATE.replace(
            "<<DOCUMENT_BUNDLE>>", document_bundle
        )
        raw_text = chat_completion_json(
            client=client,
            model=self.model,
            messages=[
                {"role": "system", "content": DOCUMENT_AGENT_SYSTEM},
                {"role": "user", "content": user_content},
            ],
            temperature=0.1,
            timeout=180,
        )

        try:
            parsed = safe_json_loads(raw_text)
            findings = [Finding(**item) for item in parsed.get("findings", [])]
            summary = parsed.get("clip_level_summary", "")
        except Exception:
            findings = []
            summary = "文档智能体输出解析失败,建议人工复核。"

        # Documents have no video timeline, hence the zero clip bounds.
        return AgentResult(
            agent_name=self.agent_name(),
            findings=findings,
            clip_start_sec=0.0,
            clip_end_sec=0.0,
            clip_level_summary=summary,
            raw_text=raw_text,
        )
app/agents/inappropriate_agent.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from app.agents.base import BaseAgent
2
+
3
+
4
class InappropriateAgent(BaseAgent):
    """Video agent for the "inappropriate behavior" (不当) dimension of the "five nots"."""

    def agent_name(self) -> str:
        # Selects this agent's prompt (via build_agent_prompt) and labels its results.
        return "inappropriate_agent"
app/agents/look_agent.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from app.agents.base import BaseAgent
2
+
3
+
4
class LookAgent(BaseAgent):
    """Video agent for the "little/no eye contact" (不(少)看) dimension of the "five nots"."""

    def agent_name(self) -> str:
        # Selects this agent's prompt (via build_agent_prompt) and labels its results.
        return "look_agent"
app/agents/point_agent.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from app.agents.base import BaseAgent
2
+
3
+
4
class PointAgent(BaseAgent):
    """Video agent for the "little/no pointing" (不(少)指) dimension of the "five nots"."""

    def agent_name(self) -> str:
        # Selects this agent's prompt (via build_agent_prompt) and labels its results.
        return "point_agent"
app/agents/respond_agent.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from app.agents.base import BaseAgent
2
+
3
+
4
class RespondAgent(BaseAgent):
    """Video agent for the "little/no response" (不(少)应) dimension of the "five nots"."""

    def agent_name(self) -> str:
        # Selects this agent's prompt (via build_agent_prompt) and labels its results.
        return "respond_agent"
app/agents/speak_agent.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from app.agents.base import BaseAgent
2
+
3
+
4
class SpeakAgent(BaseAgent):
    """Video agent for the "little/no speech" (不(少)语) dimension of the "five nots"."""

    def agent_name(self) -> str:
        # Selects this agent's prompt (via build_agent_prompt) and labels its results.
        return "speak_agent"
app/agents/timeline_agent.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from app.agents.base import BaseAgent
2
+
3
+
4
class TimelineAgent(BaseAgent):
    """Video agent that reconstructs an overall behavioral timeline of the session."""

    def agent_name(self) -> str:
        # Selects this agent's prompt (via build_agent_prompt) and labels its results.
        return "timeline_agent"
app/aggregator.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from collections import defaultdict
4
+ from app.schemas import AgentResult, MergedEvent
5
+
6
+
7
def _new_group(warning_type, agent_name, f) -> dict:
    """Open a new merge group seeded from a single finding."""
    return {
        "warning_type": warning_type,
        "start_sec": f.start_sec,
        "end_sec": f.end_sec,
        "confidence_sum": f.confidence,
        "count": 1,
        "evidences": [f.evidence],
        "sources": [agent_name],
        "behavior_tags": set(f.behavior_tags),
        "clinical_note": f.clinical_note or "",
    }


def _finalize_group(group: dict) -> MergedEvent:
    """Convert an accumulated group into a MergedEvent (confidence = mean, capped at 1.0)."""
    return MergedEvent(
        warning_type=group["warning_type"],
        start_sec=group["start_sec"],
        end_sec=group["end_sec"],
        confidence=min(1.0, group["confidence_sum"] / group["count"]),
        evidences=group["evidences"],
        sources=group["sources"],
        behavior_tags=sorted(group["behavior_tags"]),
        clinical_note=group["clinical_note"],
    )


def merge_overlapping_results(results: list[AgentResult], iou_gap: float = 2.0) -> list[MergedEvent]:
    """Merge per-agent findings into deduplicated timeline events.

    Findings are grouped by warning_type; within a type, findings whose
    start falls within iou_gap seconds of the running group's end are
    merged (union of tags, longest clinical note, mean confidence).

    Args:
        results: per-agent results whose findings carry start/end seconds.
        iou_gap: max gap (seconds) between events still considered one event.

    Returns:
        MergedEvent list sorted by (start_sec, warning_type).
    """
    grouped = defaultdict(list)
    for result in results:
        for finding in result.findings:
            grouped[finding.warning_type].append((result.agent_name, finding))

    merged: list[MergedEvent] = []

    for warning_type, items in grouped.items():
        items.sort(key=lambda x: x[1].start_sec)

        current = None
        for agent_name, f in items:
            if current is None:
                current = _new_group(warning_type, agent_name, f)
                continue

            if f.start_sec <= current["end_sec"] + iou_gap:
                current["end_sec"] = max(current["end_sec"], f.end_sec)
                current["confidence_sum"] += f.confidence
                current["count"] += 1
                current["evidences"].append(f.evidence)
                current["sources"].append(agent_name)
                current["behavior_tags"].update(f.behavior_tags)
                # Bug fix: clinical_note may be None (the group-open path already
                # defends with `or ""`); the original called len(None) here.
                note = f.clinical_note or ""
                if len(note) > len(current["clinical_note"]):
                    current["clinical_note"] = note
            else:
                merged.append(_finalize_group(current))
                current = _new_group(warning_type, agent_name, f)

        # Flush the trailing group of this warning type.
        if current is not None:
            merged.append(_finalize_group(current))

    merged.sort(key=lambda x: (x.start_sec, x.warning_type))
    return merged
app/audio_utils.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import base64
4
+ import os
5
+ import shutil
6
+ import subprocess
7
+ from dataclasses import dataclass
8
+ from typing import List
9
+
10
+
11
@dataclass
class AudioChunk:
    """One fixed-length slice of the extracted audio track."""

    # Filesystem path of the chunk file (mp3).
    path: str
    # Offset of the chunk start within the full audio, in seconds.
    start_sec: float
    # Offset of the chunk end within the full audio, in seconds.
    end_sec: float
16
+
17
+
18
def check_ffmpeg_available() -> bool:
    """Return True when both ffmpeg and ffprobe are found on PATH."""
    required_tools = ("ffmpeg", "ffprobe")
    return all(shutil.which(tool) is not None for tool in required_tools)
20
+
21
+
22
def ensure_dir(path: str) -> None:
    """Create *path* (including parents) if missing; no-op when it already exists."""
    os.makedirs(path, exist_ok=True)
24
+
25
+
26
def get_media_duration(path: str) -> float:
    """Return the media file's duration in seconds, as reported by ffprobe.

    Raises RuntimeError when ffprobe exits with a non-zero status.
    """
    probe_cmd = [
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        path,
    ]
    result = subprocess.run(
        probe_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    if result.returncode != 0:
        raise RuntimeError(f"ffprobe 获取时长失败: {result.stderr}")
    return float(result.stdout.strip())
38
+
39
+
40
def extract_audio_from_video(
    video_path: str,
    output_audio_path: str,
    bitrate: str = "64k",
) -> str:
    """
    Extract the audio track from a video as 16 kHz mono mp3 for later ASR.

    Args:
        video_path: source video file; must exist.
        output_audio_path: destination mp3 path; parent dirs are created.
        bitrate: target audio bitrate passed to ffmpeg's -b:a.

    Returns:
        output_audio_path, for call chaining.

    Raises:
        FileNotFoundError: video_path does not exist.
        RuntimeError: ffmpeg/ffprobe missing, or ffmpeg exits non-zero.
    """
    if not os.path.exists(video_path):
        raise FileNotFoundError(f"视频不存在: {video_path}")
    if not check_ffmpeg_available():
        raise RuntimeError("未检测到 ffmpeg/ffprobe,请先安装 ffmpeg。")

    # Bug fix: dirname is "" when output_audio_path is a bare filename, and
    # os.makedirs("") raises FileNotFoundError — only create a real directory.
    out_dir = os.path.dirname(output_audio_path)
    if out_dir:
        ensure_dir(out_dir)

    cmd = [
        "ffmpeg",
        "-y",
        "-i", video_path,
        "-vn",            # drop the video stream
        "-ac", "1",       # mono
        "-ar", "16000",   # 16 kHz, typical ASR input rate
        "-c:a", "mp3",
        "-b:a", bitrate,
        output_audio_path,
    ]
    proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    if proc.returncode != 0:
        raise RuntimeError(f"抽取音频失败:\n{proc.stderr}")
    return output_audio_path
70
+
71
+
72
def split_audio_to_chunks(
    audio_path: str,
    output_dir: str,
    chunk_seconds: int = 290,
) -> List[AudioChunk]:
    """
    Split audio into fixed-length chunks so each request stays under the
    qwen3-asr-flash per-call duration limit.

    Args:
        audio_path: source audio file (any ffmpeg-readable format).
        output_dir: directory for the chunk files; created if missing.
        chunk_seconds: nominal chunk length; the final chunk may be shorter.

    Returns:
        Chunks in timeline order, each carrying its [start, end) offsets in
        seconds relative to the source audio.

    Raises:
        ValueError: chunk_seconds is not positive.
        RuntimeError: ffmpeg fails on any chunk.
    """
    # Bug fix: with chunk_seconds <= 0, `start` never advances past 0 and the
    # loop below never terminates. Fail fast instead.
    if chunk_seconds <= 0:
        raise ValueError(f"chunk_seconds 必须为正数: {chunk_seconds}")

    ensure_dir(output_dir)
    duration = get_media_duration(audio_path)
    chunks: List[AudioChunk] = []

    start = 0.0
    idx = 0
    while start < duration:
        end = min(duration, start + chunk_seconds)
        chunk_path = os.path.join(output_dir, f"audio_chunk_{idx:03d}.mp3")

        # Stream copy (-acodec copy): fast, no re-encode. NOTE(review): cut
        # points then snap to mp3 frame boundaries — assumed acceptable for ASR.
        cmd = [
            "ffmpeg",
            "-y",
            "-i", audio_path,
            "-ss", str(start),
            "-t", str(end - start),
            "-acodec", "copy",
            chunk_path,
        ]
        proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        if proc.returncode != 0:
            raise RuntimeError(f"切分音频失败:\n{proc.stderr}")

        chunks.append(AudioChunk(path=chunk_path, start_sec=start, end_sec=end))
        start = end
        idx += 1

    return chunks
108
+
109
+
110
def audio_file_to_data_uri(audio_path: str, mime_type: str = "audio/mpeg") -> str:
    """Read an audio file and encode it as a base64 data URI for inline upload."""
    with open(audio_path, "rb") as audio_file:
        encoded = base64.b64encode(audio_file.read()).decode("utf-8")
    return f"data:{mime_type};base64,{encoded}"
114
+
app/config.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from dataclasses import dataclass
5
+ from dotenv import load_dotenv
6
+
7
+ load_dotenv()
8
+
9
+
10
def _clean_env_value(value: str) -> str:
    """Normalize a raw env-var value: trim whitespace and one pair of matching quotes.

    Accepts None (returns ""). Only a matching pair of double or single
    quotes is stripped, then the inner text is trimmed again.
    """
    value = (value or "").strip()
    # Bug fix: require len >= 2 so a lone quote character is returned as-is;
    # previously '"' passed both startswith and endswith and value[1:-1]
    # silently stripped it to "".
    if len(value) >= 2 and value[0] == value[-1] and value[0] in {'"', "'"}:
        value = value[1:-1].strip()
    return value
15
+
16
+
17
def _to_bool(value: str, default: bool = False) -> bool:
    """Parse a truthy env-var string; None falls back to *default*.

    Recognized truthy spellings (case-insensitive, after quote/whitespace
    cleanup): 1, true, yes, y, on. Everything else is False.
    """
    if value is None:
        return default
    normalized = _clean_env_value(value).lower()
    truthy = {"1", "true", "yes", "y", "on"}
    return normalized in truthy
22
+
23
+
24
@dataclass
class Settings:
    """Application configuration resolved from environment variables.

    A .env file is loaded at module import via python-dotenv, so values may
    come from either the process environment or .env. Construct via
    Settings.load() rather than directly.
    """

    # --- DashScope / model access ---
    api_keys: list[str]            # one or more API keys (comma-separated in env)
    base_url: str                  # OpenAI-compatible endpoint base URL
    model: str                     # multimodal chat model name
    max_workers: int               # thread-pool size for parallel calls
    video_input_mode: str          # "base64" or "remote_url"
    video_mime_type: str           # MIME type sent with base64 video
    video_fps: int                 # frame-sampling rate hint for the model
    output_dir: str                # where outputs/reports are written

    enable_rag_final: bool         # gate for the Bailian RAG final-report step

    # --- Alibaba Cloud Bailian retrieval (only required when RAG enabled) ---
    alibaba_cloud_access_key_id: str
    alibaba_cloud_access_key_secret: str
    bailian_workspace_id: str
    bailian_index_id: str
    bailian_retrieve_topn: int
    bailian_retrieve_enable_rerank: bool
    bailian_retrieve_dense_topk: int
    bailian_retrieve_sparse_topk: int
    bailian_retrieve_min_score: float

    # --- Video preprocessing ---
    enable_video_preprocess: bool
    video_preprocess_mode: str
    video_preprocess_remove_audio: bool
    preprocessed_video_dir: str

    # --- Audio agent (ASR) ---
    enable_audio_agent: bool
    audio_asr_model: str
    audio_chunk_seconds: int       # chunk length for ASR requests
    extracted_audio_dir: str

    @staticmethod
    def load() -> "Settings":
        """Build a Settings instance from the environment.

        Raises:
            ValueError: no API key configured, invalid VIDEO_INPUT_MODE, or
                RAG enabled without its required Bailian credentials.
        """
        # DASHSCOPE_API_KEYS is a comma-separated list; blanks are dropped.
        raw_keys = _clean_env_value(os.getenv("DASHSCOPE_API_KEYS", ""))
        api_keys = []
        for item in raw_keys.split(","):
            k = _clean_env_value(item)
            if k:
                api_keys.append(k)

        if not api_keys:
            raise ValueError("DASHSCOPE_API_KEYS 为空,请在 .env 中配置至少一个有效 API Key。")

        video_input_mode = _clean_env_value(os.getenv("VIDEO_INPUT_MODE", "base64")).lower()
        if video_input_mode not in {"base64", "remote_url"}:
            raise ValueError("VIDEO_INPUT_MODE 只能是 base64 或 remote_url。")

        enable_rag_final = _to_bool(os.getenv("ENABLE_RAG_FINAL", "false"))

        settings = Settings(
            api_keys=api_keys,
            base_url=_clean_env_value(os.getenv("DASHSCOPE_BASE_URL", "https://dashscope.aliyuncs.com/compatible-mode/v1")),
            model=_clean_env_value(os.getenv("QWEN_MODEL", "qwen3.5-plus")),
            max_workers=int(_clean_env_value(os.getenv("MAX_WORKERS", "6"))),
            video_input_mode=video_input_mode,
            video_mime_type=_clean_env_value(os.getenv("VIDEO_MIME_TYPE", "video/mp4")),
            video_fps=int(_clean_env_value(os.getenv("VIDEO_FPS", "2"))),
            output_dir=_clean_env_value(os.getenv("OUTPUT_DIR", "outputs")),
            enable_rag_final=enable_rag_final,
            alibaba_cloud_access_key_id=_clean_env_value(os.getenv("ALIBABA_CLOUD_ACCESS_KEY_ID", "")),
            alibaba_cloud_access_key_secret=_clean_env_value(os.getenv("ALIBABA_CLOUD_ACCESS_KEY_SECRET", "")),
            bailian_workspace_id=_clean_env_value(os.getenv("BAILIAN_WORKSPACE_ID", "")),
            bailian_index_id=_clean_env_value(os.getenv("BAILIAN_INDEX_ID", "")),
            bailian_retrieve_topn=int(_clean_env_value(os.getenv("BAILIAN_RETRIEVE_TOPN", "6"))),
            bailian_retrieve_enable_rerank=_to_bool(os.getenv("BAILIAN_RETRIEVE_ENABLE_RERANK", "true")),
            bailian_retrieve_dense_topk=int(_clean_env_value(os.getenv("BAILIAN_RETRIEVE_DENSE_TOPK", "20"))),
            bailian_retrieve_sparse_topk=int(_clean_env_value(os.getenv("BAILIAN_RETRIEVE_SPARSE_TOPK", "20"))),
            bailian_retrieve_min_score=float(_clean_env_value(os.getenv("BAILIAN_RETRIEVE_MIN_SCORE", "0.15"))),
            enable_video_preprocess=_to_bool(os.getenv("ENABLE_VIDEO_PREPROCESS", "true")),
            video_preprocess_mode=_clean_env_value(os.getenv("VIDEO_PREPROCESS_MODE", "analysis")),
            video_preprocess_remove_audio=_to_bool(os.getenv("VIDEO_PREPROCESS_REMOVE_AUDIO", "false")),
            preprocessed_video_dir=_clean_env_value(os.getenv("PREPROCESSED_VIDEO_DIR", "preprocessed_videos")),

            # Audio agent settings.
            enable_audio_agent=_to_bool(os.getenv("ENABLE_AUDIO_AGENT", "true")),
            audio_asr_model=_clean_env_value(os.getenv("AUDIO_ASR_MODEL", "qwen3-asr-flash")),
            audio_chunk_seconds=int(_clean_env_value(os.getenv("AUDIO_CHUNK_SECONDS", "290"))),
            extracted_audio_dir=_clean_env_value(os.getenv("EXTRACTED_AUDIO_DIR", "extracted_audio")),
        )

        # RAG needs Bailian credentials; fail fast, naming every missing var.
        if settings.enable_rag_final:
            missing = []
            if not settings.alibaba_cloud_access_key_id:
                missing.append("ALIBABA_CLOUD_ACCESS_KEY_ID")
            if not settings.alibaba_cloud_access_key_secret:
                missing.append("ALIBABA_CLOUD_ACCESS_KEY_SECRET")
            if not settings.bailian_workspace_id:
                missing.append("BAILIAN_WORKSPACE_ID")
            if not settings.bailian_index_id:
                missing.append("BAILIAN_INDEX_ID")
            if missing:
                raise ValueError(
                    "ENABLE_RAG_FINAL=true 时,以下环境变量必须配置:" + ", ".join(missing)
                )

        return settings