# --- Imports ---
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import pickle
import pandas as pd
import numpy as np
import yt_dlp
from youtube_comment_downloader import YoutubeCommentDownloader
from datetime import datetime
import re
import string
import nltk
import emoji
from urllib.parse import urlparse, parse_qs
from nltk.corpus import stopwords
import logging
from concurrent.futures import ThreadPoolExecutor
from fake_useragent import UserAgent
import random
import io
import base64
import math
import time

# --- NLTK data setup ---
# The Docker image is expected to ship NLTK data under /app/nltk_data; the
# download below is a best-effort refresh for environments that lack it.
nltk.data.path.append('/app/nltk_data')
try:
    nltk.download('stopwords', quiet=True)
except Exception as e:
    logging.error(f"Failed to download NLTK stopwords: {e}")

# FIX: loading the corpus used to run unguarded, so a failed download crashed
# the whole module at import time despite the try/except above. Fall back to
# an empty stopword set (preprocessing then simply keeps all tokens).
try:
    arabic_stopwords = set(stopwords.words('arabic'))
except Exception as e:
    logging.error(f"Failed to load Arabic stopwords corpus: {e}")
    arabic_stopwords = set()

# --- Logging setup ---
logging.basicConfig(
    filename='youtube_scraper.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

app = FastAPI()

# --- Model loading (best effort: endpoints check for None before use) ---
loaded_quality_model = None
loaded_sentiment_pipeline = None

try:
    # SECURITY NOTE: pickle.load executes arbitrary code on load; only model
    # files shipped with (and trusted by) this image should ever be placed here.
    with open('models/final_youtube_quality_model.pkl', 'rb') as f:
        loaded_quality_model = pickle.load(f)
    logging.info("تم تحميل نموذج جودة الفيديو بنجاح.")
except FileNotFoundError:
    logging.error("خطأ: لم يتم العثور على ملف النموذج 'final_youtube_quality_model.pkl'.")
except Exception as e:
    logging.error(f"خطأ غير متوقع أثناء تحميل نموذج جودة الفيديو: {e}")

try:
    with open('models/best_sentiment_pipeline.pkl', 'rb') as f:
        loaded_sentiment_pipeline = pickle.load(f)
    logging.info("تم تحميل نموذج تصنيف المشاعر بنجاح.")
except FileNotFoundError:
    logging.error("خطأ: لم يتم العثور على ملف النموذج 'best_sentiment_pipeline.pkl'.")
except Exception as e:
    logging.error(f"خطأ غير متوقع أثناء تحميل نموذج تحليل المشاعر: {e}")

# --- User-Agent selection for yt-dlp ---
ua = UserAgent()

# Static desktop UA used only if fake_useragent keeps yielding mobile strings.
_FALLBACK_DESKTOP_UA = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
)


def get_desktop_user_agent(max_attempts=50):
    """Return a random desktop (non-mobile) User-Agent string.

    FIX: the original looped forever (`while True`); if the UA provider only
    produced mobile strings the call never returned. The loop is now bounded
    by *max_attempts* (default keeps prior behavior for all practical cases)
    with a static desktop fallback.
    """
    mobile_markers = ('Mobile', 'Android', 'iPhone', 'iPad')
    for _ in range(max_attempts):
        candidate = random.choice([ua.chrome, ua.firefox, ua.safari])
        if all(marker not in candidate for marker in mobile_markers):
            return candidate
    return _FALLBACK_DESKTOP_UA
# User-Agent is chosen once at startup and reused for every yt-dlp request.
selected_user_agent = get_desktop_user_agent()
headers = {'User-Agent': selected_user_agent}

# Path of the cookies file inside the Docker container (consumed by yt-dlp).
COOKIES_FILE_PATH = 'cookies.txt'

ydl_opts_video_info = {
    'quiet': True,
    'skip_download': True,
    'extract_flat': True,
    'ignoreerrors': True,
    'no_warnings': True,
    'age_limit': 18,
    'force_generic_extractor': False,
    'http_headers': headers,
    'cookiefile': COOKIES_FILE_PATH,
}


class PlaylistRequest(BaseModel):
    """Request body schema for playlist-evaluation endpoints."""
    playlist_url: str


def extract_video_id(url):
    """Extract the YouTube video id from *url*, or return None.

    Supports short links (youtu.be/<id>), standard watch URLs (watch?v=<id>)
    and — generalization — /shorts/<id> and /embed/<id> URLs, which the
    original version silently rejected.
    """
    if 'youtu.be/' in url:
        return url.split('/')[-1].split('?')[0]
    if 'watch?v=' in url:
        return parse_qs(urlparse(url).query).get('v', [None])[0]
    # New: accept shorts/embed paths (standard 11-char video ids).
    match = re.search(r'/(?:shorts|embed)/([A-Za-z0-9_-]{11})', url)
    if match:
        return match.group(1)
    return None


def preprocess_text(text):
    """Normalize one raw comment for the sentiment model.

    Steps: demojize emoji, strip URLs, drop punctuation and digits,
    lowercase, collapse whitespace, and remove Arabic stopwords.
    Non-string input yields "".
    """
    if not isinstance(text, str):
        return ""
    text = emoji.demojize(text)
    text = re.sub(r'http\S+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation + string.digits))
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = text.split()
    return ' '.join(word for word in tokens if word not in arabic_stopwords)


def process_single_video2(video_url, loaded_quality_model, loaded_sentiment_pipeline,
                          max_comments_per_video=50):
    """Fetch metadata and up to *max_comments_per_video* comments for one
    video, then score it with the quality model and sentiment pipeline.

    Returns a JSON-serializable dict of stats, predicted quality and overall
    comment sentiment, or None when the video cannot be processed (bad URL,
    live/age-restricted content, or an unexpected error — all logged).
    """
    downloader = YoutubeCommentDownloader()

    video_id = extract_video_id(video_url)
    if not video_id:
        logging.warning(f"رابط فيديو غير صالح: {video_url}. تم تجاهله.")
        return None

    try:
        # Small random delay so worker threads don't hammer YouTube in lockstep.
        time.sleep(random.uniform(1, 3))
        with yt_dlp.YoutubeDL(ydl_opts_video_info) as ydl:
            info_dict = ydl.extract_info(video_url, download=False)
            # FIX: the None/live/age-restricted guard now runs BEFORE logging
            # info_dict.keys(); previously a None extraction raised
            # AttributeError instead of taking the warning path.
            if not info_dict or info_dict.get('is_live', False) or info_dict.get('age_limit', 0) > 0:
                logging.warning(f"لا يمكن معالجة الفيديو {video_id}: مباشر أو مقيد عمرًا. تم تجاهله.")
                return None
            logging.info(f"[فيديو: {video_url}] تم استخراج البيانات: {info_dict.keys()}")

            # FIX: yt-dlp reports None (not 0) for hidden counters; coerce so
            # the ratio arithmetic below cannot raise TypeError.
            views = info_dict.get('view_count') or 0
            likes = info_dict.get('like_count') or 0
            logging.info(f"[فيديو: {video_url}] المشاهدات: {views}, الإعجابات: {likes}")

            # FIX: upload_date may also be None; treat it like 'Unknown'.
            upload_date = info_dict.get('upload_date') or 'Unknown'
            publish_year = int(upload_date[:4]) if upload_date != 'Unknown' else datetime.now().year

            time.sleep(random.uniform(1, 3))
            sampled_comments = []
            try:
                for comment in downloader.get_comments_from_url(video_url):
                    if 'text' in comment:
                        sampled_comments.append(comment['text'])
                    if len(sampled_comments) >= max_comments_per_video:
                        break
            except Exception as e:
                # Best effort: missing comments only weaken the score, they
                # should not drop the whole video.
                logging.warning(f"فشل في جلب التعليقات للفيديو {video_id}. السبب: {e}.")
                sampled_comments = []

            like_view_ratio = likes / views if views > 0 else 0.0
            comment_view_ratio = len(sampled_comments) / views if views > 0 else 0.0
            engagement_score = like_view_ratio + comment_view_ratio

            positive_comments = 0
            negative_comments = 0
            overall_sentiment = "لا توجد تعليقات كافية"
            if sampled_comments:
                processed_comments = [preprocess_text(c) for c in sampled_comments]
                sentiment_predictions = loaded_sentiment_pipeline.predict(processed_comments)
                # FIX: np.sum returns numpy integer types, which FastAPI's
                # JSON encoder cannot serialize; cast to plain int.
                positive_comments = int(np.sum(sentiment_predictions == 1))
                negative_comments = int(np.sum(sentiment_predictions == 0))
                if positive_comments > negative_comments:
                    overall_sentiment = "إيجابي"
                elif negative_comments > positive_comments:
                    overall_sentiment = "سلبي"
                else:
                    overall_sentiment = "محايد"

            # Feature row for the quality model. Flat extraction does not
            # return a duration, so video_duration_seconds is fixed at 0.
            input_df = pd.DataFrame(
                [[views, likes, len(sampled_comments), 0, publish_year,
                  like_view_ratio, comment_view_ratio, engagement_score]],
                columns=['views_count', 'likes_count', 'comments_count',
                         'video_duration_seconds', 'publish_year',
                         'like_view_ratio', 'comment_view_ratio',
                         'engagement_score'])

            playlist_quality = "لم يتم التقييم"
            try:
                prediction_numeric = loaded_quality_model.predict(input_df)[0]
                logging.info(f"[فيديو: {video_url}] نتيجة التنبؤ: {prediction_numeric}")
                playlist_quality = "جيد" if prediction_numeric == 1 else "سيء"
            except Exception as e:
                playlist_quality = f"خطأ في التقييم: {e}"
                logging.error(f"[فيديو: {video_url}] خطأ في تقييم الفيديو: {e}")

            return {
                "video_url": video_url,
                "views": views,
                "likes": likes,
                "comments": len(sampled_comments),
                "like_view_ratio": like_view_ratio,
                "comment_view_ratio": comment_view_ratio,
                "engagement_score": engagement_score,
                "quality": playlist_quality,
                "sentiment": overall_sentiment,
                "positive_comments": positive_comments,
                "negative_comments": negative_comments,
            }
    except Exception as e:
        logging.error(f"حدث خطأ في الفيديو {video_url}: {e}")
        return None


@app.post("/evaluate_youtube_playlist_individually_same_method2/")
async def evaluate_youtube_playlist_individually_same_method2(
        youtube_url: str, max_comments_per_video: int = 50, max_workers: int = 3):
    """Evaluate the first 10 videos of a YouTube playlist with a composite
    quality + sentiment score (version 2).

    Each video is processed concurrently by process_single_video2; the
    aggregate combines the share of "good" videos (weight 0.6) with the
    share of positive comments (weight 0.4).
    """
    if loaded_quality_model is None or loaded_sentiment_pipeline is None:
        logging.error("لم يتم تحميل النماذج المطلوبة.")
        return {"error": "لم يتم تحميل النماذج المطلوبة."}

    video_links = []
    try:
        # yt-dlp handles the cookie file itself via the 'cookiefile' option.
        with yt_dlp.YoutubeDL({'extract_flat': True, 'quiet': True,
                               'playlist_items': '1:10',
                               'cookiefile': COOKIES_FILE_PATH}) as ydl:
            playlist_info = ydl.extract_info(youtube_url, download=False)
            # FIX: extract_info may return None; guard before the membership
            # test instead of raising TypeError into the except below.
            if playlist_info and 'entries' in playlist_info:
                for entry in playlist_info['entries'][:10]:
                    if entry and 'url' in entry:
                        video_links.append(entry['url'])
            else:
                logging.warning("لا توجد فيديوهات في هذه القائمة.")
                return {"error": "لا توجد فيديوهات في هذه القائمة."}
    except Exception as e:
        logging.error(f"فشل في جلب روابط الفيديو: {e}")
        return {"error": f"فشل في جلب روابط الفيديو: {e}"}

    individual_results = []
    # NOTE(review): future.result() and time.sleep() block the event loop
    # inside this async endpoint; acceptable for a low-traffic service, but
    # consider run_in_executor / asyncio.sleep if responsiveness matters.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(process_single_video2, video_url,
                            loaded_quality_model, loaded_sentiment_pipeline,
                            max_comments_per_video)
            for video_url in video_links
        ]
        for future in futures:
            result = future.result()
            if result:
                individual_results.append(result)
            time.sleep(random.uniform(0.5, 1.5))

    num_good_videos = sum(1 for r in individual_results if r and r.get('quality') == 'جيد')
    total_positive_comments = sum(r.get('positive_comments', 0) for r in individual_results if r)
    total_negative_comments = sum(r.get('negative_comments', 0) for r in individual_results if r)
    total_classified_comments = total_positive_comments + total_negative_comments
    total_videos = len(individual_results)

    percent_good_videos = (num_good_videos / total_videos) * 100 if total_videos > 0 else 0
    if percent_good_videos >= 70:
        overall_quality = "جيد جداً"
    elif percent_good_videos >= 50:
        overall_quality = "جيد"
    else:
        overall_quality = "سيء"

    # Composite: 60% share of good videos + 40% share of positive comments.
    WEIGHT_QUALITY = 0.6
    WEIGHT_SENTIMENT = 0.4
    positive_ratio = (total_positive_comments / total_classified_comments) * 100 \
        if total_classified_comments > 0 else 0.0
    composite_score = (WEIGHT_QUALITY * percent_good_videos) + (WEIGHT_SENTIMENT * positive_ratio)
    if composite_score >= 75:
        composite_quality = "ممتاز"
    elif composite_score >= 60:
        composite_quality = "جيد"
    else:
        composite_quality = "ضعيف"

    return {
        "overall_quality": overall_quality,
        "composite_quality": composite_quality,
        "composite_score": round(composite_score, 1),
        "percent_good_videos": round(percent_good_videos, 1),
        "positive_ratio": round(positive_ratio, 1),
        "negative_ratio": round(100 - positive_ratio, 1),
    }