# Hugging Face Space: YouTube Content Analyzer
# (download audio with yt-dlp, transcribe with Whisper, classify with zero-shot BART)
| import gradio as gr | |
| import whisper | |
| import yt_dlp | |
| from transformers import pipeline | |
| import tempfile | |
| import os | |
| import json | |
# Cache models globally so repeated requests reuse the loaded weights.
MODEL = None       # whisper speech-to-text model (loaded lazily)
CLASSIFIER = None  # transformers zero-shot classification pipeline (loaded lazily)

def load_models():
    """Lazily load and cache the Whisper and zero-shot classification models.

    Returns:
        tuple: (whisper model, transformers zero-shot classification pipeline).
    """
    global MODEL, CLASSIFIER
    # Check BOTH globals: if a previous call loaded MODEL but failed before
    # CLASSIFIER was assigned, checking only MODEL would return (model, None).
    if MODEL is None or CLASSIFIER is None:
        print("Loading models...")
        MODEL = whisper.load_model("base")
        CLASSIFIER = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
    return MODEL, CLASSIFIER
def convert_cookies_to_single_line():
    """Read cookies.txt and print/return it with newlines escaped as literal '\\n'.

    Intended for copying a Netscape-format cookies file into a single-line
    Hugging Face secret. Returns the flattened string, or None when
    cookies.txt does not exist.
    """
    try:
        with open("cookies.txt") as handle:
            content = handle.read()
    except FileNotFoundError:
        print("Error: cookies.txt file not found")
        return None
    flattened = content.replace("\n", "\\n")
    print("Copy this to Hugging Face Secrets (YOUTUBE_COOKIES_TXT):")
    print(flattened)
    return flattened
def setup_cookies():
    """Materialize cookies.txt from the YOUTUBE_COOKIES_TXT env var.

    The secret stores the file as a single line with literal '\\n' escapes;
    this restores real newlines and writes cookies.txt. Returns True when
    the file was written, False when the env var is missing/empty.
    """
    raw = os.getenv('YOUTUBE_COOKIES_TXT')
    if not raw:
        return False
    restored = raw.replace("\\n", "\n")
    with open('cookies.txt', 'w') as out:
        out.write(restored)
    return True
def normalize_youtube_url(url):
    """Convert various YouTube URL formats to a standard watch URL.

    Handles youtu.be short links, /shorts/ and /embed/ paths, and watch
    URLs (the 'v' parameter is extracted regardless of its position, so
    extra query parameters are always stripped correctly).

    Args:
        url: The URL string supplied by the user; a missing scheme is tolerated.

    Returns:
        A canonical 'https://www.youtube.com/watch?v=<id>' URL, or None when
        the input is not a recognizable YouTube video URL.
    """
    from urllib.parse import urlparse, parse_qs

    url = url.strip()
    # urlparse only populates netloc when a scheme is present.
    if '://' not in url:
        url = 'https://' + url
    parsed = urlparse(url)
    host = parsed.netloc.lower()

    # youtu.be short links carry the video id as the first path segment.
    if host == 'youtu.be' or host.endswith('.youtu.be'):
        video_id = parsed.path.lstrip('/').split('/')[0]
        return f'https://www.youtube.com/watch?v={video_id}' if video_id else None

    # Match youtube.com and subdomains (www., m., music.) by hostname, not
    # by substring, so 'evil.com/youtube.com/watch' cannot slip through.
    if host != 'youtube.com' and not host.endswith('.youtube.com'):
        return None

    path = parsed.path
    # Shorts and embed URLs carry the id in the path.
    for prefix in ('/shorts/', '/embed/'):
        if path.startswith(prefix):
            video_id = path[len(prefix):].split('/')[0]
            return f'https://www.youtube.com/watch?v={video_id}' if video_id else None

    # Standard watch URL: read 'v' from the query string wherever it appears.
    if path == '/watch':
        video_id = parse_qs(parsed.query).get('v', [''])[0]
        return f'https://www.youtube.com/watch?v={video_id}' if video_id else None

    return None
def analyze_video(yt_url):
    """Download a YouTube video's audio, transcribe it, and classify the content.

    Args:
        yt_url: A YouTube URL in any supported format (watch, youtu.be, ...).

    Returns:
        tuple: (transcription text or "Error: ..." message,
                top category label or "",
                confidence score rounded to 3 decimals, or 0 on error).
    """
    try:
        # Normalize and validate the URL before doing any heavy work.
        normalized_url = normalize_youtube_url(yt_url)
        if not normalized_url:
            return "Error: Invalid YouTube URL. Must be from youtube.com or youtu.be", "", 0
        model, classifier = load_models()
        has_cookies = setup_cookies()
        # delete=False: yt-dlp/ffmpeg must be able to reopen the path by name.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
            tmp_path = tmp.name
        try:
            ydl_opts = {
                'format': 'bestaudio/best',
                'outtmpl': tmp_path,
                'quiet': True,
                'extract_audio': True,
                'postprocessors': [{
                    'key': 'FFmpegExtractAudio',
                    'preferredcodec': 'mp3',
                    'preferredquality': '192',
                }],
                # Browser-like headers reduce the chance of bot detection.
                'http_headers': {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
                    'Accept-Language': 'en-US,en;q=0.9',
                    'Referer': 'https://www.youtube.com/'
                },
                'socket_timeout': 30,
                'noplaylist': True,
                'verbose': False
            }
            if has_cookies:
                ydl_opts.update({
                    'cookiefile': 'cookies.txt',
                    'extract_flat': 'in_playlist',
                })
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                try:
                    # Probe first so auth/extraction failures surface before download.
                    info = ydl.extract_info(normalized_url, download=False)
                    if not info.get('url') and not info.get('requested_downloads'):
                        return "Error: Failed to extract video info. Cookies may be invalid.", "", 0
                    ydl.download([normalized_url])
                except yt_dlp.utils.DownloadError as e:
                    if "Sign in to confirm you're not a bot" in str(e):
                        return "Error: YouTube requires authentication. Please ensure cookies are fresh and valid.", "", 0
                    raise  # bare raise preserves the original traceback
            # NOTE(review): the FFmpegExtractAudio postprocessor may write the
            # converted file to '<outtmpl>.mp3' instead of the template path
            # itself; prefer whichever candidate actually holds the audio.
            audio_path = tmp_path
            alt_path = tmp_path + ".mp3"
            if (not os.path.exists(audio_path) or os.path.getsize(audio_path) == 0) \
                    and os.path.exists(alt_path):
                audio_path = alt_path
            result = model.transcribe(audio_path)
            transcription = result["text"]
            labels = ["educational", "entertainment", "news", "political", "religious", "technical"]
            classification = classifier(
                transcription,
                candidate_labels=labels,
                hypothesis_template="This content is about {}."
            )
            return transcription, classification["labels"][0], round(classification["scores"][0], 3)
        finally:
            # Remove the temp audio under either candidate name. Only remove
            # cookies.txt when setup_cookies() created it from the env var —
            # never delete a user-provided cookies.txt we did not write.
            cleanup = [tmp_path, tmp_path + ".mp3"]
            if has_cookies:
                cleanup.append('cookies.txt')
            for f in cleanup:
                if os.path.exists(f):
                    os.remove(f)
    except Exception as e:
        # Top-level boundary: surface any failure as an error string in the UI.
        return f"Error: {str(e)}", "", 0
# --- Gradio UI: URL in, transcription + category + confidence out ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎬 YouTube Content Analyzer")
    with gr.Row():
        url_input = gr.Textbox(
            label="YouTube URL",
            placeholder="https://www.youtube.com/watch?v=... or https://youtu.be/...",
        )
        analyze_btn = gr.Button("Analyze", variant="primary")
    with gr.Row():
        transcript_box = gr.Textbox(label="Transcription", interactive=False, lines=5)
        with gr.Column():
            category_label = gr.Label(label="Category")
            score_box = gr.Number(label="Confidence Score", precision=2)
    # Wire the button to the analysis pipeline.
    analyze_btn.click(
        analyze_video,
        inputs=url_input,
        outputs=[transcript_box, category_label, score_box],
    )
# Script entry point: optionally echo cookies in secret format, then serve the UI.
if __name__ == "__main__":
    # NOTE(review): this prints the entire cookies file (a credential) to
    # stdout/logs at every startup when cookies.txt exists — consider removing
    # or gating this helper outside of local setup.
    if os.path.exists("cookies.txt"):
        convert_cookies_to_single_line()
    demo.launch()