# EthosAI / video.py — uploaded by Mentosyevsky (commit 421c641, verified)
# -*- coding: utf-8 -*-
"""video.ipynb

Automatically generated by Colab.

Original file is located at
https://colab.research.google.com/drive/1d-uwY0B5q7gOItN6fyA-1RoBcDwkOxb6
"""
# ✅ Load the Whisper model once at module import time (recommended):
# every call to analyze_video() below reuses this single CPU/int8 instance
# instead of paying the model-load cost per request.
from faster_whisper import WhisperModel
whisper_model = WhisperModel("tiny", device="cpu", compute_type="int8")
# The whole original video.py pipeline, wrapped as the analyze_video() entry point.
def analyze_video(file, lang=None):
    """Analyze a video for visual, textual and audio content risks.

    Pipeline:
      1. YOLO object detection (custom-trained weights) on every frame.
      2. Google Cloud Vision SafeSearch + OCR on every 10th frame.
      3. Whisper speech-to-text on the audio track (module-level model).

    Args:
        file: Path to the video file to analyze.
        lang: Whisper language code; ``"auto"`` or ``None`` means
            automatic language detection.

    Returns:
        A human-readable (Chinese) report string summarizing detections.
    """
    import os
    import zipfile
    import base64

    import cv2
    import requests
    from dotenv import load_dotenv
    from ultralytics import YOLO

    output = ""  # accumulated report text returned to the caller

    # === Configuration ===
    VIDEO_PATH = file
    YOLO_MODEL_PATH = "trained_dataset/runs/detect/train/weights/best.pt"

    # ✅ Unpack the bundled YOLO weights on first use (if not yet extracted).
    if not os.path.exists("trained_dataset"):
        print("📦 正在解压 YOLO 模型文件...")
        with zipfile.ZipFile("trained_dataset.zip", "r") as zip_ref:
            zip_ref.extractall(".")
        print("✅ 解压完成")

    # ✅ Google Vision credentials: read from the environment (.env supported).
    load_dotenv()
    API_KEY = os.getenv("GOOGLE_VISION_API_KEY")
    VISION_API_URL = f"https://vision.googleapis.com/v1/images:annotate?key={API_KEY}"
    # Ordinal scores for Vision SafeSearch likelihood strings;
    # a score >= 4 means the frame was rated LIKELY or VERY_LIKELY.
    LIKELIHOOD_MAPPING = {
        "UNKNOWN": 0, "VERY_UNLIKELY": 1, "UNLIKELY": 2,
        "POSSIBLE": 3, "LIKELY": 4, "VERY_LIKELY": 5,
    }

    # === Initialization ===
    model = YOLO(YOLO_MODEL_PATH)
    cap = cv2.VideoCapture(VIDEO_PATH)
    safe_search_results = {"adult": 0, "spoof": 0, "medical": 0, "violence": 0, "racy": 0}
    detected_texts = set()
    detected_labels = set()
    total_frames = 0

    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            total_frames += 1

            # YOLO detection on every frame.
            results = model.predict(source=frame, conf=0.2, save=False, verbose=False)
            if len(results[0].boxes) > 0:
                for c in results[0].boxes.cls:
                    class_id = int(c.item())
                    detected_labels.add(results[0].names[class_id])

            # Google Vision SafeSearch + OCR on every 10th frame (cost/rate control).
            if total_frames % 10 == 0:
                _, buffer = cv2.imencode('.jpg', frame)
                img_base64 = base64.b64encode(buffer).decode()
                payload = {
                    "requests": [{
                        "image": {"content": img_base64},
                        "features": [
                            {"type": "SAFE_SEARCH_DETECTION"},
                            {"type": "TEXT_DETECTION"},
                        ],
                    }]
                }
                # timeout so a stalled API call cannot hang the whole analysis
                response = requests.post(VISION_API_URL, json=payload, timeout=30)
                result = response.json()
                if "responses" in result and len(result["responses"]) > 0:
                    safe = result["responses"][0].get("safeSearchAnnotation", {})
                    for key in safe_search_results:
                        if LIKELIHOOD_MAPPING.get(safe.get(key, "UNKNOWN"), 0) >= 4:
                            safe_search_results[key] += 1
                    # NOTE(review): textAnnotations[0] is the full text block and the
                    # rest are individual words, so entries in the set may overlap.
                    for text in result["responses"][0].get("textAnnotations", []):
                        detected_texts.add(text["description"])
    finally:
        # Always release the capture handle, even if a frame/API step raised.
        cap.release()

    # === Whisper audio transcription ===
    output += "\n🎙 正在转录音频文本...:\n"
    segments, info = whisper_model.transcribe(
        VIDEO_PATH, language=lang if lang != "auto" else None
    )
    transcribed_text = "".join(seg.text for seg in segments)

    # === Report assembly ===
    output += "\n🎯 YOLO检测到的图案类别:\n"
    if detected_labels:
        # sorted() makes the report deterministic (set iteration order is arbitrary)
        output += "✅ 检测到图案:" + "、".join(sorted(detected_labels)) + "\n"
    else:
        output += "❌ 未检测到任何图案\n"

    def risk_level(count, total):
        # >5% of all frames flagged -> high risk; more than 5 frames -> medium.
        if count > total * 0.05:
            return "⛔️ 高风险"
        elif count > 5:
            return "⚠️ 中等风险"
        else:
            return "✅ 低风险"

    output += "\n🔎 内容安全风险分析(每类满足可能性 ≥ LIKELY 的帧计数):\n"
    for k, v in safe_search_results.items():
        # Fixed: the original f-string was missing the "(" before the frame count.
        output += f"{k.capitalize():<10}: {risk_level(v, total_frames)}({v} 帧)\n"

    output += "\n📝 视觉文字识别(OCR):\n"
    output += " ".join(sorted(detected_texts)) + "\n" if detected_texts else "无可识别文字\n"

    output += "\n🔊 Whisper语音识别结果:\n"
    output += transcribed_text + "\n" if transcribed_text.strip() else "无有效语音\n"

    return output