# NOTE: Hugging Face Spaces page header ("Spaces: Sleeping") — scrape artifact, not code.
| from fastapi import FastAPI, HTTPException | |
| from pydantic import BaseModel | |
| import pickle | |
| import pandas as pd | |
| import numpy as np | |
| import yt_dlp | |
| from youtube_comment_downloader import YoutubeCommentDownloader | |
| from datetime import datetime | |
| import re | |
| import string | |
| import nltk | |
| import emoji | |
| from urllib.parse import urlparse, parse_qs | |
| from nltk.corpus import stopwords | |
| import logging | |
| from concurrent.futures import ThreadPoolExecutor | |
| from fake_useragent import UserAgent | |
| import random | |
| import io | |
| import base64 | |
| import math | |
| import time | |
# --- NLTK and logging setup ---
# BUGFIX: configure logging FIRST. The original called logging.error()
# before basicConfig(), so an NLTK download failure was never written to
# the log file (it went to the stderr last-resort handler instead).
logging.basicConfig(
    filename='youtube_scraper.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Search the corpus directory baked into the Docker image before trying a
# (possibly offline) download.
nltk.data.path.append('/app/nltk_data')
try:
    nltk.download('stopwords', quiet=True)
except Exception as e:
    logging.error(f"Failed to download NLTK stopwords: {e}")
try:
    # Arabic stop words consumed by preprocess_text() when cleaning comments.
    arabic_stopwords = set(stopwords.words('arabic'))
except LookupError as e:
    # ROBUSTNESS: corpus unavailable (download failed, no local copy).
    # Degrade to an empty set so the module still imports; stop-word
    # filtering simply becomes a no-op.
    logging.error(f"Arabic stopwords unavailable, continuing without them: {e}")
    arabic_stopwords = set()
app = FastAPI()


def _load_pickle_model(path, loaded_msg, missing_msg, error_prefix):
    """Load a pickled model from *path*; return None (and log) on any failure.

    The log messages are passed in so the two call sites below keep their
    original (Arabic) wording byte-for-byte.

    NOTE(review): pickle.load is unsafe on untrusted files — these are
    project-owned artifacts shipped inside the image, so this is accepted.
    """
    try:
        with open(path, 'rb') as f:
            model = pickle.load(f)
        logging.info(loaded_msg)
        return model
    except FileNotFoundError:
        logging.error(missing_msg)
    except Exception as e:
        logging.error(f"{error_prefix}{e}")
    return None


# Model handles: stay None when loading fails so endpoints can report the error.
loaded_quality_model = _load_pickle_model(
    'models/final_youtube_quality_model.pkl',
    "تم تحميل نموذج جودة الفيديو بنجاح.",
    "خطأ: لم يتم العثور على ملف النموذج 'final_youtube_quality_model.pkl'.",
    "خطأ غير متوقع أثناء تحميل نموذج جودة الفيديو: ",
)
loaded_sentiment_pipeline = _load_pickle_model(
    'models/best_sentiment_pipeline.pkl',
    "تم تحميل نموذج تصنيف المشاعر بنجاح.",
    "خطأ: لم يتم العثور على ملف النموذج 'best_sentiment_pipeline.pkl'.",
    "خطأ غير متوقع أثناء تحميل نموذج تحليل المشاعر: ",
)
# --- User-Agent setup for yt-dlp ---
ua = UserAgent()


def get_desktop_user_agent(max_attempts=50):
    """Return a random desktop (non-mobile) User-Agent string.

    Draws candidates from fake_useragent and rejects mobile strings.

    BUGFIX: the original `while True` loop could hang startup forever if the
    library kept returning mobile UAs, and an exception from the `ua.*`
    properties (fake_useragent can raise on fetch failure) would crash the
    module. After *max_attempts* rejections or any library error we fall
    back to a static desktop Chrome UA.
    """
    fallback = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/120.0.0.0 Safari/537.36')
    for _ in range(max_attempts):
        try:
            candidate = random.choice([ua.chrome, ua.firefox, ua.safari])
        except Exception as e:
            logging.warning(f"fake_useragent lookup failed: {e}")
            break
        if all(x not in candidate for x in ['Mobile', 'Android', 'iPhone', 'iPad']):
            return candidate
    return fallback


selected_user_agent = get_desktop_user_agent()
headers = {'User-Agent': selected_user_agent}
# Path of the cookies file inside the Docker container.
COOKIES_FILE_PATH = 'cookies.txt'

# yt-dlp options for metadata-only extraction (never downloads media).
ydl_opts_video_info = dict(
    quiet=True,
    skip_download=True,
    extract_flat=True,
    ignoreerrors=True,       # failed extractions yield None instead of raising
    no_warnings=True,
    age_limit=18,
    force_generic_extractor=False,
    http_headers=headers,    # desktop User-Agent selected above
    cookiefile=COOKIES_FILE_PATH,  # consumed directly by yt-dlp
)
# --- Request model ---
class PlaylistRequest(BaseModel):
    """Request body carrying the YouTube playlist URL to evaluate."""

    playlist_url: str
# --- Video-ID extraction ---
def extract_video_id(url):
    """Extract the YouTube video ID from *url*, or return None.

    Supports youtu.be short links and standard ``watch?v=`` URLs as before;
    generalized to also recognize ``/shorts/<id>`` and ``/embed/<id>`` paths
    (backward compatible — previous inputs behave identically).
    """
    if 'youtu.be/' in url:
        # Short link: ID is the last path segment, minus any query string.
        return url.split('/')[-1].split('?')[0]
    if 'watch?v=' in url:
        return parse_qs(urlparse(url).query).get('v', [None])[0]
    # Generalization: shorts/embed URL shapes carry an 11-char ID in the path.
    match = re.search(r'/(?:shorts|embed)/([A-Za-z0-9_-]{11})', url)
    if match:
        return match.group(1)
    return None
# --- Comment cleaning ---
def preprocess_text(text):
    """Normalize a raw comment string for the sentiment pipeline.

    Pipeline (order matters — emoji names must exist before punctuation is
    stripped): demojize, drop URLs, remove punctuation and digits,
    lowercase, collapse whitespace, then filter Arabic stop words.
    Non-string input yields "".
    """
    if not isinstance(text, str):
        return ""
    cleaned = emoji.demojize(text)
    cleaned = re.sub(r'http\S+', '', cleaned)
    strip_table = str.maketrans('', '', string.punctuation + string.digits)
    cleaned = cleaned.translate(strip_table).lower()
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    kept_tokens = (tok for tok in cleaned.split() if tok not in arabic_stopwords)
    return ' '.join(kept_tokens)
# --- Single-video processing (version 2) ---
def process_single_video2(video_url, loaded_quality_model, loaded_sentiment_pipeline, max_comments_per_video=50):
    """Evaluate one YouTube video: metadata, comment sentiment, and quality.

    Fetches metadata via yt-dlp, samples up to *max_comments_per_video*
    comments, classifies their sentiment, and runs the quality model on the
    engagement features. Returns a metrics dict, or None when the URL is
    invalid, the video is live/age-restricted, or extraction fails.
    """
    downloader = YoutubeCommentDownloader()
    video_id = extract_video_id(video_url)
    if not video_id:
        logging.warning(f"رابط فيديو غير صالح: {video_url}. تم تجاهله.")
        return None
    try:
        # Random pause to reduce the chance of YouTube rate limiting.
        time.sleep(random.uniform(1, 3))
        with yt_dlp.YoutubeDL(ydl_opts_video_info) as ydl:
            info_dict = ydl.extract_info(video_url, download=False)
        # BUGFIX: check for None BEFORE touching info_dict. With
        # 'ignoreerrors' enabled, extract_info may return None, and the
        # original code called info_dict.keys() first, raising
        # AttributeError instead of skipping the video gracefully.
        if not info_dict or info_dict.get('is_live', False) or info_dict.get('age_limit', 0) > 0:
            logging.warning(f"لا يمكن معالجة الفيديو {video_id}: مباشر أو مقيد عمرًا. تم تجاهله.")
            return None
        logging.info(f"[فيديو: {video_url}] تم استخراج البيانات: {info_dict.keys()}")
        # BUGFIX: yt-dlp may report these keys with an explicit None value;
        # `or 0` prevents a TypeError in the ratio divisions below.
        views = info_dict.get('view_count') or 0
        likes = info_dict.get('like_count') or 0
        logging.info(f"[فيديو: {video_url}] المشاهدات: {views}, الإعجابات: {likes}")
        upload_date = info_dict.get('upload_date', 'Unknown')
        # upload_date is YYYYMMDD when known; fall back to the current year.
        publish_year = int(upload_date[:4]) if upload_date != 'Unknown' else datetime.now().year
        time.sleep(random.uniform(1, 3))
        sampled_comments = []
        try:
            for comment in downloader.get_comments_from_url(video_url):
                if 'text' in comment:
                    sampled_comments.append(comment['text'])
                if len(sampled_comments) >= max_comments_per_video:
                    break
        except Exception as e:
            # Best-effort: comment download failure is non-fatal.
            logging.warning(f"فشل في جلب التعليقات للفيديو {video_id}. السبب: {e}.")
            sampled_comments = []
        like_view_ratio = likes / views if views > 0 else 0.0
        comment_view_ratio = len(sampled_comments) / views if views > 0 else 0.0
        engagement_score = like_view_ratio + comment_view_ratio
        positive_comments = 0
        negative_comments = 0
        overall_sentiment = "لا توجد تعليقات كافية"
        if sampled_comments:
            processed_comments = [preprocess_text(c) for c in sampled_comments]
            sentiment_predictions = loaded_sentiment_pipeline.predict(processed_comments)
            # BUGFIX: cast numpy integers to plain int so the returned dict
            # is JSON-serializable by the web framework.
            positive_comments = int(np.sum(sentiment_predictions == 1))
            negative_comments = int(np.sum(sentiment_predictions == 0))
            if positive_comments > negative_comments:
                overall_sentiment = "إيجابي"
            elif negative_comments > positive_comments:
                overall_sentiment = "سلبي"
            else:
                overall_sentiment = "محايد"
        # Feature row must match the training column order of the quality
        # model; duration is unknown in flat extraction, hence the 0.
        input_df = pd.DataFrame([[views, likes, len(sampled_comments), 0, publish_year,
                                  like_view_ratio, comment_view_ratio, engagement_score]],
                                columns=['views_count', 'likes_count', 'comments_count',
                                         'video_duration_seconds', 'publish_year',
                                         'like_view_ratio', 'comment_view_ratio', 'engagement_score'])
        playlist_quality = "لم يتم التقييم"
        try:
            prediction_numeric = loaded_quality_model.predict(input_df)[0]
            logging.info(f"[فيديو: {video_url}] نتيجة التنبؤ: {prediction_numeric}")
            playlist_quality = "جيد" if prediction_numeric == 1 else "سيء"
        except Exception as e:
            playlist_quality = f"خطأ في التقييم: {e}"
            logging.error(f"[فيديو: {video_url}] خطأ في تقييم الفيديو: {e}")
        return {
            "video_url": video_url,
            "views": views,
            "likes": likes,
            "comments": len(sampled_comments),
            "like_view_ratio": like_view_ratio,
            "comment_view_ratio": comment_view_ratio,
            "engagement_score": engagement_score,
            "quality": playlist_quality,
            "sentiment": overall_sentiment,
            "positive_comments": positive_comments,
            "negative_comments": negative_comments
        }
    except Exception as e:
        logging.error(f"حدث خطأ في الفيديو {video_url}: {e}")
        return None
async def evaluate_youtube_playlist_individually_same_method2(youtube_url: str, max_comments_per_video: int = 50, max_workers: int = 3):
    """
    Evaluate a YouTube playlist with a composite scoring scheme (version 2).

    Fetches the first 10 playlist entries, scores each video concurrently
    via process_single_video2, then aggregates per-video quality and
    comment sentiment into an overall rating plus a weighted composite
    score. Returns a summary dict, or {"error": ...} on failure.
    """
    if loaded_quality_model is None or loaded_sentiment_pipeline is None:
        logging.error("لم يتم تحميل النماذج المطلوبة.")
        return {"error": "لم يتم تحميل النماذج المطلوبة."}
    video_links = []
    try:
        # The yt-dlp cookiefile option stays here.
        with yt_dlp.YoutubeDL({'extract_flat': True, 'quiet': True, 'playlist_items': '1:10', 'cookiefile': COOKIES_FILE_PATH}) as ydl:
            playlist_info = ydl.extract_info(youtube_url, download=False)
        # BUGFIX: extract_info may return None on failure; the original
        # evaluated "'entries' in None", raising a TypeError that was caught
        # with a misleading message. Treat None like an empty playlist.
        if not playlist_info or 'entries' not in playlist_info:
            logging.warning("لا توجد فيديوهات في هذه القائمة.")
            return {"error": "لا توجد فيديوهات في هذه القائمة."}
        for entry in playlist_info['entries'][:10]:
            if entry and 'url' in entry:
                video_links.append(entry['url'])
    except Exception as e:
        logging.error(f"فشل في جلب روابط الفيديو: {e}")
        return {"error": f"فشل في جلب روابط الفيديو: {e}"}
    individual_results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(
                process_single_video2,
                video_url,
                loaded_quality_model,
                loaded_sentiment_pipeline,
                max_comments_per_video
            ) for video_url in video_links
        ]
        # NOTE: the original slept after each future.result(); all work was
        # already submitted, so that sleep only delayed the response and was
        # removed.
        for future in futures:
            result = future.result()
            if result:
                individual_results.append(result)
    # individual_results contains no None entries (filtered above), so no
    # per-item truthiness guards are needed while aggregating.
    num_good_videos = sum(1 for r in individual_results if r.get('quality') == 'جيد')
    # Cast to int so numpy integers never leak into the JSON response.
    total_positive_comments = int(sum(r.get('positive_comments', 0) for r in individual_results))
    total_negative_comments = int(sum(r.get('negative_comments', 0) for r in individual_results))
    total_classified_comments = total_positive_comments + total_negative_comments
    total_videos = len(individual_results)
    percent_good_videos = (num_good_videos / total_videos) * 100 if total_videos > 0 else 0
    if percent_good_videos >= 70:
        overall_quality = "جيد جداً"
    elif percent_good_videos >= 50:
        overall_quality = "جيد"
    else:
        overall_quality = "سيء"
    # Composite score: per-video quality dominates (60%), sentiment adds 40%.
    WEIGHT_QUALITY = 0.6
    WEIGHT_SENTIMENT = 0.4
    positive_ratio = (total_positive_comments / total_classified_comments) * 100 if total_classified_comments > 0 else 0.0
    composite_score = (WEIGHT_QUALITY * percent_good_videos) + (WEIGHT_SENTIMENT * positive_ratio)
    if composite_score >= 75:
        composite_quality = "ممتاز"
    elif composite_score >= 60:
        composite_quality = "جيد"
    else:
        composite_quality = "ضعيف"
    return {
        "overall_quality": overall_quality,
        "composite_quality": composite_quality,
        "composite_score": round(composite_score, 1),
        "percent_good_videos": round(percent_good_videos, 1),
        "positive_ratio": round(positive_ratio, 1),
        "negative_ratio": round(100 - positive_ratio, 1),
    }