# youtube-analyzer / main.py
# (Hugging Face Space file-page header retained as comments:
#  uploaded by mohammed777 — "Update main.py" — commit 2b3094e, verified)
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import pickle
import pandas as pd
import numpy as np
import yt_dlp
from youtube_comment_downloader import YoutubeCommentDownloader
from datetime import datetime
import re
import string
import nltk
import emoji
from urllib.parse import urlparse, parse_qs
from nltk.corpus import stopwords
import logging
from concurrent.futures import ThreadPoolExecutor
from fake_useragent import UserAgent
import random
import io
import base64
import math
import time
# --- NLTK and logging setup ---
# The Docker image pre-bakes NLTK data here; keep it on the search path.
nltk.data.path.append('/app/nltk_data')
try:
    nltk.download('stopwords', quiet=True)
except Exception as e:
    logging.error(f"Failed to download NLTK stopwords: {e}")

# BUG FIX: the original called stopwords.words('arabic') unguarded. If the
# download above failed, that raises LookupError and crashes module import,
# defeating the logged-and-continue handling. Fall back to an empty set so
# the service still starts (comments then simply keep their stopwords).
try:
    arabic_stopwords = set(stopwords.words('arabic'))
except LookupError as e:
    logging.error(f"Arabic stopwords unavailable, proceeding without them: {e}")
    arabic_stopwords = set()

# --- Logging configuration ---
logging.basicConfig(
    filename='youtube_scraper.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

app = FastAPI()
# Model holders. They stay None when loading fails so the endpoints can
# detect missing models and answer with an error instead of crashing.
loaded_quality_model = None
loaded_sentiment_pipeline = None

# NOTE(review): pickle.load on model files is only safe because the files
# ship with the image; never point these paths at untrusted input.
try:
    with open('models/final_youtube_quality_model.pkl', 'rb') as fh:
        loaded_quality_model = pickle.load(fh)
    logging.info("تم تحميل نموذج جودة الفيديو بنجاح.")
except FileNotFoundError:
    logging.error("خطأ: لم يتم العثور على ملف النموذج 'final_youtube_quality_model.pkl'.")
except Exception as e:
    logging.error(f"خطأ غير متوقع أثناء تحميل نموذج جودة الفيديو: {e}")

try:
    with open('models/best_sentiment_pipeline.pkl', 'rb') as fh:
        loaded_sentiment_pipeline = pickle.load(fh)
    logging.info("تم تحميل نموذج تصنيف المشاعر بنجاح.")
except FileNotFoundError:
    logging.error("خطأ: لم يتم العثور على ملف النموذج 'best_sentiment_pipeline.pkl'.")
except Exception as e:
    logging.error(f"خطأ غير متوقع أثناء تحميل نموذج تحليل المشاعر: {e}")
# --- User-Agent settings for yt-dlp ---
ua = UserAgent()

# Known-good desktop UA used when fake_useragent keeps yielding mobile strings.
_FALLBACK_DESKTOP_UA = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
)

def get_desktop_user_agent(max_attempts=50):
    """Return a random desktop (non-mobile) User-Agent string.

    BUG FIX: the original `while True` loop could spin forever if every
    candidate contained a mobile marker; attempts are now bounded and a
    static desktop UA is returned as a last resort.

    Args:
        max_attempts: How many random candidates to try before falling back.
    """
    mobile_markers = ('Mobile', 'Android', 'iPhone', 'iPad')
    for _ in range(max_attempts):
        candidate = random.choice([ua.chrome, ua.firefox, ua.safari])
        if not any(marker in candidate for marker in mobile_markers):
            return candidate
    return _FALLBACK_DESKTOP_UA

selected_user_agent = get_desktop_user_agent()
headers = {'User-Agent': selected_user_agent}

# Cookies file path inside the Docker container.
COOKIES_FILE_PATH = 'cookies.txt'

# Shared yt-dlp options for metadata-only extraction (no download).
ydl_opts_video_info = {
    'quiet': True,
    'skip_download': True,
    'extract_flat': True,
    'ignoreerrors': True,
    'no_warnings': True,
    'age_limit': 18,
    'force_generic_extractor': False,
    'http_headers': headers,
    'cookiefile': COOKIES_FILE_PATH  # option kept for yt-dlp
}
# --- النموذج ---
class PlaylistRequest(BaseModel):
    """Request schema carrying a YouTube playlist URL.

    NOTE(review): the endpoint below currently accepts query parameters
    instead of this body model — confirm whether this schema is still used.
    """
    playlist_url: str
# --- دالة استخراج ID الفيديو ---
def extract_video_id(url):
    """Extract the YouTube video ID from common URL formats.

    Handles short links (``youtu.be/ID``), standard watch URLs
    (``watch?v=ID``), and — generalized from the original — ``/shorts/``,
    ``/embed/`` and ``/live/`` paths. Existing behavior for the first two
    forms is unchanged.

    Args:
        url: A YouTube video URL.

    Returns:
        The video ID string, or None when no ID can be found.
    """
    if 'youtu.be/' in url:
        return url.split('/')[-1].split('?')[0]
    elif 'watch?v=' in url:
        return parse_qs(urlparse(url).query).get('v', [None])[0]
    # Generalization: path-style URLs used by Shorts, embeds and live pages.
    for marker in ('/shorts/', '/embed/', '/live/'):
        if marker in url:
            candidate = url.split(marker, 1)[1].split('?')[0].split('/')[0]
            return candidate or None
    return None
# --- تنظيف التعليقات ---
def preprocess_text(text):
    """Normalize a raw comment string for the sentiment classifier.

    Steps: replace emoji with textual names, strip URLs, drop punctuation
    and digits, lowercase, collapse whitespace, and remove Arabic stopwords.
    Non-string input yields an empty string.
    """
    if not isinstance(text, str):
        return ""
    cleaned = emoji.demojize(text)
    cleaned = re.sub(r'http\S+', '', cleaned)
    removal_table = str.maketrans('', '', string.punctuation + string.digits)
    cleaned = cleaned.translate(removal_table).lower()
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    kept_tokens = [tok for tok in cleaned.split() if tok not in arabic_stopwords]
    return ' '.join(kept_tokens)
# --- معالجة فيديو واحد فقط (نسخة 2) ---
def process_single_video2(video_url, loaded_quality_model, loaded_sentiment_pipeline, max_comments_per_video=50):
    """Fetch metadata and comments for one video and score it (version 2).

    Args:
        video_url: URL of the video to analyze.
        loaded_quality_model: sklearn-style model with .predict() over the
            engagement feature frame built below.
        loaded_sentiment_pipeline: sklearn-style pipeline with .predict()
            over preprocessed comment strings (1 = positive, 0 = negative).
        max_comments_per_video: Cap on comments sampled per video.

    Returns:
        dict of metrics, quality label and sentiment summary, or None when
        the video cannot be processed (bad URL, live, age-restricted,
        extraction failure).
    """
    # cookies_file was removed from here; yt-dlp options carry it instead.
    downloader = YoutubeCommentDownloader()
    video_id = extract_video_id(video_url)
    if not video_id:
        logging.warning(f"رابط فيديو غير صالح: {video_url}. تم تجاهله.")
        return None
    try:
        # Randomized delay to reduce the chance of rate limiting.
        time.sleep(random.uniform(1, 3))
        with yt_dlp.YoutubeDL(ydl_opts_video_info) as ydl:
            info_dict = ydl.extract_info(video_url, download=False)
        # BUG FIX: check for a missing info_dict BEFORE touching .keys();
        # the original logged info_dict.keys() first and crashed on None
        # (ignoreerrors=True makes extract_info return None on failure).
        if not info_dict or info_dict.get('is_live', False) or info_dict.get('age_limit', 0) > 0:
            logging.warning(f"لا يمكن معالجة الفيديو {video_id}: مباشر أو مقيد عمرًا. تم تجاهله.")
            return None
        logging.info(f"[فيديو: {video_url}] تم استخراج البيانات: {info_dict.keys()}")

        # BUG FIX: yt-dlp may return these keys with an explicit None value,
        # which .get(key, 0) does not cover; coerce with `or` to avoid
        # TypeError in the comparisons and divisions below.
        views = info_dict.get('view_count') or 0
        likes = info_dict.get('like_count') or 0
        logging.info(f"[فيديو: {video_url}] المشاهدات: {views}, الإعجابات: {likes}")
        upload_date = info_dict.get('upload_date') or 'Unknown'
        publish_year = int(upload_date[:4]) if upload_date != 'Unknown' else datetime.now().year

        time.sleep(random.uniform(1, 3))
        sampled_comments = []
        try:
            for comment in downloader.get_comments_from_url(video_url):
                if 'text' in comment:
                    sampled_comments.append(comment['text'])
                if len(sampled_comments) >= max_comments_per_video:
                    break
        except Exception as e:
            # Best-effort: comments are optional, proceed with none.
            logging.warning(f"فشل في جلب التعليقات للفيديو {video_id}. السبب: {e}.")
            sampled_comments = []

        like_view_ratio = likes / views if views > 0 else 0.0
        comment_view_ratio = len(sampled_comments) / views if views > 0 else 0.0
        engagement_score = like_view_ratio + comment_view_ratio

        positive_comments = 0
        negative_comments = 0
        overall_sentiment = "لا توجد تعليقات كافية"
        if sampled_comments:
            processed_comments = [preprocess_text(c) for c in sampled_comments]
            sentiment_predictions = loaded_sentiment_pipeline.predict(processed_comments)
            # BUG FIX: np.sum returns numpy integer types, which the JSON
            # encoder used for the API response rejects; cast to plain int.
            positive_comments = int(np.sum(sentiment_predictions == 1))
            negative_comments = int(np.sum(sentiment_predictions == 0))
            if positive_comments > negative_comments:
                overall_sentiment = "إيجابي"
            elif negative_comments > positive_comments:
                overall_sentiment = "سلبي"
            else:
                overall_sentiment = "محايد"

        # Feature row must match the training column order of the quality
        # model; duration is unknown here, hence the hard-coded 0.
        input_df = pd.DataFrame(
            [[views, likes, len(sampled_comments), 0, publish_year,
              like_view_ratio, comment_view_ratio, engagement_score]],
            columns=['views_count', 'likes_count', 'comments_count',
                     'video_duration_seconds', 'publish_year',
                     'like_view_ratio', 'comment_view_ratio', 'engagement_score'])

        playlist_quality = "لم يتم التقييم"
        try:
            prediction_numeric = loaded_quality_model.predict(input_df)[0]
            logging.info(f"[فيديو: {video_url}] نتيجة التنبؤ: {prediction_numeric}")
            playlist_quality = "جيد" if prediction_numeric == 1 else "سيء"
        except Exception as e:
            playlist_quality = f"خطأ في التقييم: {e}"
            logging.error(f"[فيديو: {video_url}] خطأ في تقييم الفيديو: {e}")

        return {
            "video_url": video_url,
            "views": views,
            "likes": likes,
            "comments": len(sampled_comments),
            "like_view_ratio": like_view_ratio,
            "comment_view_ratio": comment_view_ratio,
            "engagement_score": engagement_score,
            "quality": playlist_quality,
            "sentiment": overall_sentiment,
            "positive_comments": positive_comments,
            "negative_comments": negative_comments,
        }
    except Exception as e:
        logging.error(f"حدث خطأ في الفيديو {video_url}: {e}")
        return None
@app.post("/evaluate_youtube_playlist_individually_same_method2/")
async def evaluate_youtube_playlist_individually_same_method2(youtube_url: str, max_comments_per_video: int = 50, max_workers: int = 3):
    """Evaluate a YouTube playlist with a composite scoring system (version 2).

    Fetches up to the first 10 videos, scores each in a thread pool via
    ``process_single_video2``, then aggregates per-video quality predictions
    and comment sentiment into a weighted composite score.

    Args:
        youtube_url: Playlist URL understood by yt-dlp.
        max_comments_per_video: Cap on comments sampled per video.
        max_workers: Thread-pool size for concurrent video processing.

    Returns:
        dict with overall/composite quality labels and percentage breakdowns,
        or a dict with an "error" key when prerequisites fail.
    """
    if loaded_quality_model is None or loaded_sentiment_pipeline is None:
        logging.error("لم يتم تحميل النماذج المطلوبة.")
        return {"error": "لم يتم تحميل النماذج المطلوبة."}

    video_links = []
    try:
        # The cookiefile option for yt-dlp stays here.
        playlist_opts = {
            'extract_flat': True,
            'quiet': True,
            'playlist_items': '1:10',
            'cookiefile': COOKIES_FILE_PATH,
        }
        with yt_dlp.YoutubeDL(playlist_opts) as ydl:
            playlist_info = ydl.extract_info(youtube_url, download=False)
        # BUG FIX: extract_info can return None; the original membership test
        # ('entries' in playlist_info) raised TypeError before the error
        # response could be built.
        if not playlist_info or 'entries' not in playlist_info:
            logging.warning("لا توجد فيديوهات في هذه القائمة.")
            return {"error": "لا توجد فيديوهات في هذه القائمة."}
        for entry in playlist_info['entries'][:10]:
            if entry and 'url' in entry:
                video_links.append(entry['url'])
    except Exception as e:
        logging.error(f"فشل في جلب روابط الفيديو: {e}")
        return {"error": f"فشل في جلب روابط الفيديو: {e}"}

    individual_results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(
                process_single_video2,
                video_url,
                loaded_quality_model,
                loaded_sentiment_pipeline,
                max_comments_per_video,
            )
            for video_url in video_links
        ]
        for future in futures:
            result = future.result()
            if result:
                individual_results.append(result)
            # Pacing kept from the original; NOTE(review): this only delays
            # result collection — the downloads are already submitted.
            time.sleep(random.uniform(0.5, 1.5))

    num_good_videos = sum(1 for r in individual_results if r and r.get('quality') == 'جيد')
    total_positive_comments = sum(r.get('positive_comments', 0) for r in individual_results if r)
    total_negative_comments = sum(r.get('negative_comments', 0) for r in individual_results if r)
    total_classified_comments = total_positive_comments + total_negative_comments

    total_videos = len(individual_results)
    percent_good_videos = (num_good_videos / total_videos) * 100 if total_videos > 0 else 0

    # Tiered label from the share of videos the quality model marked "good".
    if percent_good_videos >= 70:
        overall_quality = "جيد جداً"
    elif percent_good_videos >= 50:
        overall_quality = "جيد"
    else:
        overall_quality = "سيء"

    # Composite score: 60% video-quality share + 40% positive-comment share.
    WEIGHT_QUALITY = 0.6
    WEIGHT_SENTIMENT = 0.4
    positive_ratio = (total_positive_comments / total_classified_comments) * 100 if total_classified_comments > 0 else 0.0
    composite_score = (WEIGHT_QUALITY * percent_good_videos) + (WEIGHT_SENTIMENT * positive_ratio)

    if composite_score >= 75:
        composite_quality = "ممتاز"
    elif composite_score >= 60:
        composite_quality = "جيد"
    else:
        composite_quality = "ضعيف"

    return {
        "overall_quality": overall_quality,
        "composite_quality": composite_quality,
        "composite_score": round(composite_score, 1),
        "percent_good_videos": round(percent_good_videos, 1),
        "positive_ratio": round(positive_ratio, 1),
        "negative_ratio": round(100 - positive_ratio, 1),
    }