File size: 4,603 Bytes
8f4216f
 
 
 
 
 
 
 
863a20f
 
421c641
8f4216f
6dca484
0dc317f
6dca484
 
 
 
 
 
 
 
 
 
 
 
bd43f58
6dca484
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
863a20f
 
 
6dca484
 
 
 
 
8f4216f
6dca484
 
 
 
 
 
 
 
 
863a20f
6dca484
 
 
 
 
 
 
 
863a20f
 
6dca484
8f4216f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# -*- coding: utf-8 -*-
"""video.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1d-uwY0B5q7gOItN6fyA-1RoBcDwkOxb6
"""
# Load the Whisper speech-recognition model ONCE at module import time, so
# repeated analyze_video() calls reuse the same instance instead of paying
# the model-load cost per call. "tiny" on CPU with int8 keeps it lightweight.
from faster_whisper import WhisperModel
whisper_model = WhisperModel("tiny", device="cpu", compute_type="int8")

# 将整段video.py封装为analyze_video函数
def analyze_video(file, lang=None):
    """Analyze a video for visual patterns, unsafe content, on-screen text and speech.

    Pipeline: YOLO object detection on every frame, Google Vision
    SafeSearch + OCR on every 10th frame, then audio transcription with
    the globally loaded faster-whisper model.

    Args:
        file: Path to the video file to analyze.
        lang: Whisper language code; ``None`` or ``"auto"`` enables
            automatic language detection.

    Returns:
        A human-readable report string (Chinese labels) summarizing the
        detected pattern classes, per-category content-safety risk levels,
        OCR text, and the speech transcript.
    """
    output = ""  # accumulates the report text

    # Function-local imports keep the heavy dependencies lazy.
    import base64
    import os
    import zipfile

    import cv2
    import requests
    from dotenv import load_dotenv
    from ultralytics import YOLO

    # === Configuration ===
    VIDEO_PATH = file
    YOLO_MODEL_PATH = "trained_dataset/runs/detect/train/weights/best.pt"
    # Unpack the bundled YOLO weights on first use.
    if not os.path.exists("trained_dataset"):
        print("📦 正在解压 YOLO 模型文件...")
        with zipfile.ZipFile("trained_dataset.zip", "r") as zip_ref:
            zip_ref.extractall(".")
        print("✅ 解压完成")

    # Load the Google Vision API key from the environment (.env).
    load_dotenv()
    API_KEY = os.getenv("GOOGLE_VISION_API_KEY")
    VISION_API_URL = f"https://vision.googleapis.com/v1/images:annotate?key={API_KEY}"
    # Vision API likelihood enum -> ordinal, so "LIKELY"+ can be thresholded.
    LIKELIHOOD_MAPPING = {"UNKNOWN": 0, "VERY_UNLIKELY": 1, "UNLIKELY": 2, "POSSIBLE": 3, "LIKELY": 4, "VERY_LIKELY": 5}

    # === Initialization ===
    model = YOLO(YOLO_MODEL_PATH)
    cap = cv2.VideoCapture(VIDEO_PATH)

    safe_search_results = {"adult": 0, "spoof": 0, "medical": 0, "violence": 0, "racy": 0}
    detected_texts = set()
    detected_labels = set()
    total_frames = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        total_frames += 1

        # YOLO detection on every frame.
        results = model.predict(source=frame, conf=0.2, save=False, verbose=False)
        if len(results[0].boxes) > 0:
            for c in results[0].boxes.cls:
                class_id = int(c.item())
                class_name = results[0].names[class_id]
                detected_labels.add(class_name)

        # Google Vision SafeSearch + OCR on every 10th frame only,
        # to limit API traffic.
        if total_frames % 10 == 0:
            _, buffer = cv2.imencode('.jpg', frame)
            img_base64 = base64.b64encode(buffer).decode()
            payload = {
                "requests": [{
                    "image": {"content": img_base64},
                    "features": [
                        {"type": "SAFE_SEARCH_DETECTION"},
                        {"type": "TEXT_DETECTION"}
                    ]
                }]
            }
            # Bounded timeout so a stalled API call cannot hang the whole
            # analysis; a failed request or non-JSON reply just skips this
            # frame's content analysis instead of crashing mid-video.
            try:
                response = requests.post(VISION_API_URL, json=payload, timeout=30)
                result = response.json()
            except (requests.RequestException, ValueError):
                result = {}

            if "responses" in result and len(result["responses"]) > 0:
                safe = result["responses"][0].get("safeSearchAnnotation", {})
                for key in safe_search_results.keys():
                    # Count the frame for each category rated LIKELY or worse.
                    if LIKELIHOOD_MAPPING.get(safe.get(key, "UNKNOWN"), 0) >= 4:
                        safe_search_results[key] += 1

                texts = result["responses"][0].get("textAnnotations", [])
                for text in texts:
                    detected_texts.add(text["description"])

    cap.release()

    # === Whisper speech-to-text ===
    output += "\n🎙 正在转录音频文本...:\n"

    segments, info = whisper_model.transcribe(VIDEO_PATH, language=lang if lang != "auto" else None)
    transcribed_text = "".join([seg.text for seg in segments])

    # === Report assembly ===
    output += "\n🎯 YOLO检测到的图案类别:\n"
    if detected_labels:
        output += "✅ 检测到图案:" + "、".join(detected_labels) + "\n"
    else:
        output += "❌ 未检测到任何图案\n"

    def risk_level(count, total):
        # High risk when more than 5% of all frames were flagged; medium
        # when the absolute flagged count exceeds 5; low otherwise.
        if count > total * 0.05:
            return "⛔️ 高风险"
        elif count > 5:
            return "⚠️ 中等风险"
        else:
            return "✅ 低风险"

    output += "\n🔎 内容安全风险分析(每类满足可能性 ≥ LIKELY 的帧计数):\n"
    for k, v in safe_search_results.items():
        # Fixed: the original f-string was missing the opening paren before
        # the frame count, producing e.g. "高风险5 帧)" instead of "高风险(5 帧)".
        output += f"{k.capitalize():<10}: {risk_level(v, total_frames)}({v} 帧)\n"

    output += "\n📝 视觉文字识别(OCR):\n"
    output += " ".join(detected_texts) + "\n" if detected_texts else "无可识别文字\n"

    output += "\n🔊 Whisper语音识别结果:\n"
    output += transcribed_text + "\n" if transcribed_text.strip() else "无有效语音\n"

    return output