# youtube-analyzer / main.py
# (Hugging Face Space file-page header retained as comments:
#  uploaded by mohammed777 — "Update main.py" — commit 2b3094e, verified)
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import pickle
import pandas as pd
import numpy as np
import yt_dlp
from youtube_comment_downloader import YoutubeCommentDownloader
from datetime import datetime
import re
import string
import nltk
import emoji
from urllib.parse import urlparse, parse_qs
from nltk.corpus import stopwords
import logging
from concurrent.futures import ThreadPoolExecutor
from fake_useragent import UserAgent
import random
import io
import base64
import math
import time
# --- NLTK and logging setup ---
# The Docker image pre-bakes NLTK data here; keep it on the search path.
nltk.data.path.append('/app/nltk_data')
try:
    nltk.download('stopwords', quiet=True)
except Exception as e:
    logging.error(f"Failed to download NLTK stopwords: {e}")

# BUG FIX: the original called stopwords.words('arabic') unguarded. If the
# download above failed, that raises LookupError and crashes module import,
# defeating the logged-and-continue handling. Fall back to an empty set so
# the service still starts (comments then simply keep their stopwords).
try:
    arabic_stopwords = set(stopwords.words('arabic'))
except LookupError as e:
    logging.error(f"Arabic stopwords unavailable, proceeding without them: {e}")
    arabic_stopwords = set()

# --- Logging configuration ---
logging.basicConfig(
    filename='youtube_scraper.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

app = FastAPI()
# Model holders. They stay None when loading fails so the endpoints can
# detect missing models and answer with an error instead of crashing.
loaded_quality_model = None
loaded_sentiment_pipeline = None

# NOTE(review): pickle.load on model files is only safe because the files
# ship with the image; never point these paths at untrusted input.
try:
    with open('models/final_youtube_quality_model.pkl', 'rb') as fh:
        loaded_quality_model = pickle.load(fh)
    logging.info("تم تحميل نموذج جودة الفيديو بنجاح.")
except FileNotFoundError:
    logging.error("خطأ: لم يتم العثور على ملف النموذج 'final_youtube_quality_model.pkl'.")
except Exception as e:
    logging.error(f"خطأ غير متوقع أثناء تحميل نموذج جودة الفيديو: {e}")

try:
    with open('models/best_sentiment_pipeline.pkl', 'rb') as fh:
        loaded_sentiment_pipeline = pickle.load(fh)
    logging.info("تم تحميل نموذج تصنيف المشاعر بنجاح.")
except FileNotFoundError:
    logging.error("خطأ: لم يتم العثور على ملف النموذج 'best_sentiment_pipeline.pkl'.")
except Exception as e:
    logging.error(f"خطأ غير متوقع أثناء تحميل نموذج تحليل المشاعر: {e}")
# --- User-Agent settings for yt-dlp ---
ua = UserAgent()

# Known-good desktop UA used when fake_useragent keeps yielding mobile strings.
_FALLBACK_DESKTOP_UA = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
)

def get_desktop_user_agent(max_attempts=50):
    """Return a random desktop (non-mobile) User-Agent string.

    BUG FIX: the original `while True` loop could spin forever if every
    candidate contained a mobile marker; attempts are now bounded and a
    static desktop UA is returned as a last resort.

    Args:
        max_attempts: How many random candidates to try before falling back.
    """
    mobile_markers = ('Mobile', 'Android', 'iPhone', 'iPad')
    for _ in range(max_attempts):
        candidate = random.choice([ua.chrome, ua.firefox, ua.safari])
        if not any(marker in candidate for marker in mobile_markers):
            return candidate
    return _FALLBACK_DESKTOP_UA

selected_user_agent = get_desktop_user_agent()
headers = {'User-Agent': selected_user_agent}

# Cookies file path inside the Docker container.
COOKIES_FILE_PATH = 'cookies.txt'

# Shared yt-dlp options for metadata-only extraction (no download).
ydl_opts_video_info = {
    'quiet': True,
    'skip_download': True,
    'extract_flat': True,
    'ignoreerrors': True,
    'no_warnings': True,
    'age_limit': 18,
    'force_generic_extractor': False,
    'http_headers': headers,
    'cookiefile': COOKIES_FILE_PATH  # option kept for yt-dlp
}
# --- النموذج ---
class PlaylistRequest(BaseModel):
    """Request schema carrying a YouTube playlist URL.

    NOTE(review): the endpoint below currently accepts query parameters
    instead of this body model — confirm whether this schema is still used.
    """
    playlist_url: str
# --- دالة استخراج ID الفيديو ---
def extract_video_id(url):
    """Extract the YouTube video ID from common URL formats.

    Handles short links (``youtu.be/ID``), standard watch URLs
    (``watch?v=ID``), and — generalized from the original — ``/shorts/``,
    ``/embed/`` and ``/live/`` paths. Existing behavior for the first two
    forms is unchanged.

    Args:
        url: A YouTube video URL.

    Returns:
        The video ID string, or None when no ID can be found.
    """
    if 'youtu.be/' in url:
        return url.split('/')[-1].split('?')[0]
    elif 'watch?v=' in url:
        return parse_qs(urlparse(url).query).get('v', [None])[0]
    # Generalization: path-style URLs used by Shorts, embeds and live pages.
    for marker in ('/shorts/', '/embed/', '/live/'):
        if marker in url:
            candidate = url.split(marker, 1)[1].split('?')[0].split('/')[0]
            return candidate or None
    return None
# --- تنظيف التعليقات ---
def preprocess_text(text):
    """Normalize a raw comment string for the sentiment classifier.

    Steps: replace emoji with textual names, strip URLs, drop punctuation
    and digits, lowercase, collapse whitespace, and remove Arabic stopwords.
    Non-string input yields an empty string.
    """
    if not isinstance(text, str):
        return ""
    cleaned = emoji.demojize(text)
    cleaned = re.sub(r'http\S+', '', cleaned)
    removal_table = str.maketrans('', '', string.punctuation + string.digits)
    cleaned = cleaned.translate(removal_table).lower()
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    kept_tokens = [tok for tok in cleaned.split() if tok not in arabic_stopwords]
    return ' '.join(kept_tokens)
# --- معالجة فيديو واحد فقط (نسخة 2) ---
def process_single_video2(video_url, loaded_quality_model, loaded_sentiment_pipeline, max_comments_per_video=50):
    """Fetch metadata and comments for one video and score it (version 2).

    Args:
        video_url: URL of the video to analyze.
        loaded_quality_model: sklearn-style model with .predict() over the
            engagement feature frame built below.
        loaded_sentiment_pipeline: sklearn-style pipeline with .predict()
            over preprocessed comment strings (1 = positive, 0 = negative).
        max_comments_per_video: Cap on comments sampled per video.

    Returns:
        dict of metrics, quality label and sentiment summary, or None when
        the video cannot be processed (bad URL, live, age-restricted,
        extraction failure).
    """
    # cookies_file was removed from here; yt-dlp options carry it instead.
    downloader = YoutubeCommentDownloader()
    video_id = extract_video_id(video_url)
    if not video_id:
        logging.warning(f"رابط فيديو غير صالح: {video_url}. تم تجاهله.")
        return None
    try:
        # Randomized delay to reduce the chance of rate limiting.
        time.sleep(random.uniform(1, 3))
        with yt_dlp.YoutubeDL(ydl_opts_video_info) as ydl:
            info_dict = ydl.extract_info(video_url, download=False)
        # BUG FIX: check for a missing info_dict BEFORE touching .keys();
        # the original logged info_dict.keys() first and crashed on None
        # (ignoreerrors=True makes extract_info return None on failure).
        if not info_dict or info_dict.get('is_live', False) or info_dict.get('age_limit', 0) > 0:
            logging.warning(f"لا يمكن معالجة الفيديو {video_id}: مباشر أو مقيد عمرًا. تم تجاهله.")
            return None
        logging.info(f"[فيديو: {video_url}] تم استخراج البيانات: {info_dict.keys()}")

        # BUG FIX: yt-dlp may return these keys with an explicit None value,
        # which .get(key, 0) does not cover; coerce with `or` to avoid
        # TypeError in the comparisons and divisions below.
        views = info_dict.get('view_count') or 0
        likes = info_dict.get('like_count') or 0
        logging.info(f"[فيديو: {video_url}] المشاهدات: {views}, الإعجابات: {likes}")
        upload_date = info_dict.get('upload_date') or 'Unknown'
        publish_year = int(upload_date[:4]) if upload_date != 'Unknown' else datetime.now().year

        time.sleep(random.uniform(1, 3))
        sampled_comments = []
        try:
            for comment in downloader.get_comments_from_url(video_url):
                if 'text' in comment:
                    sampled_comments.append(comment['text'])
                if len(sampled_comments) >= max_comments_per_video:
                    break
        except Exception as e:
            # Best-effort: comments are optional, proceed with none.
            logging.warning(f"فشل في جلب التعليقات للفيديو {video_id}. السبب: {e}.")
            sampled_comments = []

        like_view_ratio = likes / views if views > 0 else 0.0
        comment_view_ratio = len(sampled_comments) / views if views > 0 else 0.0
        engagement_score = like_view_ratio + comment_view_ratio

        positive_comments = 0
        negative_comments = 0
        overall_sentiment = "لا توجد تعليقات كافية"
        if sampled_comments:
            processed_comments = [preprocess_text(c) for c in sampled_comments]
            sentiment_predictions = loaded_sentiment_pipeline.predict(processed_comments)
            # BUG FIX: np.sum returns numpy integer types, which the JSON
            # encoder used for the API response rejects; cast to plain int.
            positive_comments = int(np.sum(sentiment_predictions == 1))
            negative_comments = int(np.sum(sentiment_predictions == 0))
            if positive_comments > negative_comments:
                overall_sentiment = "إيجابي"
            elif negative_comments > positive_comments:
                overall_sentiment = "سلبي"
            else:
                overall_sentiment = "محايد"

        # Feature row must match the training column order of the quality
        # model; duration is unknown here, hence the hard-coded 0.
        input_df = pd.DataFrame(
            [[views, likes, len(sampled_comments), 0, publish_year,
              like_view_ratio, comment_view_ratio, engagement_score]],
            columns=['views_count', 'likes_count', 'comments_count',
                     'video_duration_seconds', 'publish_year',
                     'like_view_ratio', 'comment_view_ratio', 'engagement_score'])

        playlist_quality = "لم يتم التقييم"
        try:
            prediction_numeric = loaded_quality_model.predict(input_df)[0]
            logging.info(f"[فيديو: {video_url}] نتيجة التنبؤ: {prediction_numeric}")
            playlist_quality = "جيد" if prediction_numeric == 1 else "سيء"
        except Exception as e:
            playlist_quality = f"خطأ في التقييم: {e}"
            logging.error(f"[فيديو: {video_url}] خطأ في تقييم الفيديو: {e}")

        return {
            "video_url": video_url,
            "views": views,
            "likes": likes,
            "comments": len(sampled_comments),
            "like_view_ratio": like_view_ratio,
            "comment_view_ratio": comment_view_ratio,
            "engagement_score": engagement_score,
            "quality": playlist_quality,
            "sentiment": overall_sentiment,
            "positive_comments": positive_comments,
            "negative_comments": negative_comments,
        }
    except Exception as e:
        logging.error(f"حدث خطأ في الفيديو {video_url}: {e}")
        return None
@app.post("/evaluate_youtube_playlist_individually_same_method2/")
async def evaluate_youtube_playlist_individually_same_method2(youtube_url: str, max_comments_per_video: int = 50, max_workers: int = 3):
    """Evaluate a YouTube playlist with a composite scoring system (version 2).

    Fetches up to the first 10 videos, scores each in a thread pool via
    ``process_single_video2``, then aggregates per-video quality predictions
    and comment sentiment into a weighted composite score.

    Args:
        youtube_url: Playlist URL understood by yt-dlp.
        max_comments_per_video: Cap on comments sampled per video.
        max_workers: Thread-pool size for concurrent video processing.

    Returns:
        dict with overall/composite quality labels and percentage breakdowns,
        or a dict with an "error" key when prerequisites fail.
    """
    if loaded_quality_model is None or loaded_sentiment_pipeline is None:
        logging.error("لم يتم تحميل النماذج المطلوبة.")
        return {"error": "لم يتم تحميل النماذج المطلوبة."}

    video_links = []
    try:
        # The cookiefile option for yt-dlp stays here.
        playlist_opts = {
            'extract_flat': True,
            'quiet': True,
            'playlist_items': '1:10',
            'cookiefile': COOKIES_FILE_PATH,
        }
        with yt_dlp.YoutubeDL(playlist_opts) as ydl:
            playlist_info = ydl.extract_info(youtube_url, download=False)
        # BUG FIX: extract_info can return None; the original membership test
        # ('entries' in playlist_info) raised TypeError before the error
        # response could be built.
        if not playlist_info or 'entries' not in playlist_info:
            logging.warning("لا توجد فيديوهات في هذه القائمة.")
            return {"error": "لا توجد فيديوهات في هذه القائمة."}
        for entry in playlist_info['entries'][:10]:
            if entry and 'url' in entry:
                video_links.append(entry['url'])
    except Exception as e:
        logging.error(f"فشل في جلب روابط الفيديو: {e}")
        return {"error": f"فشل في جلب روابط الفيديو: {e}"}

    individual_results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(
                process_single_video2,
                video_url,
                loaded_quality_model,
                loaded_sentiment_pipeline,
                max_comments_per_video,
            )
            for video_url in video_links
        ]
        for future in futures:
            result = future.result()
            if result:
                individual_results.append(result)
            # Pacing kept from the original; NOTE(review): this only delays
            # result collection — the downloads are already submitted.
            time.sleep(random.uniform(0.5, 1.5))

    num_good_videos = sum(1 for r in individual_results if r and r.get('quality') == 'جيد')
    total_positive_comments = sum(r.get('positive_comments', 0) for r in individual_results if r)
    total_negative_comments = sum(r.get('negative_comments', 0) for r in individual_results if r)
    total_classified_comments = total_positive_comments + total_negative_comments

    total_videos = len(individual_results)
    percent_good_videos = (num_good_videos / total_videos) * 100 if total_videos > 0 else 0

    # Tiered label from the share of videos the quality model marked "good".
    if percent_good_videos >= 70:
        overall_quality = "جيد جداً"
    elif percent_good_videos >= 50:
        overall_quality = "جيد"
    else:
        overall_quality = "سيء"

    # Composite score: 60% video-quality share + 40% positive-comment share.
    WEIGHT_QUALITY = 0.6
    WEIGHT_SENTIMENT = 0.4
    positive_ratio = (total_positive_comments / total_classified_comments) * 100 if total_classified_comments > 0 else 0.0
    composite_score = (WEIGHT_QUALITY * percent_good_videos) + (WEIGHT_SENTIMENT * positive_ratio)

    if composite_score >= 75:
        composite_quality = "ممتاز"
    elif composite_score >= 60:
        composite_quality = "جيد"
    else:
        composite_quality = "ضعيف"

    return {
        "overall_quality": overall_quality,
        "composite_quality": composite_quality,
        "composite_score": round(composite_score, 1),
        "percent_good_videos": round(percent_good_videos, 1),
        "positive_ratio": round(positive_ratio, 1),
        "negative_ratio": round(100 - positive_ratio, 1),
    }