# NOTE: removed Hugging Face Spaces page-scrape residue (status lines, file
# size, commit hashes, line-number gutter) that was not part of the source.
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import pickle
import pandas as pd
import numpy as np
import yt_dlp
from youtube_comment_downloader import YoutubeCommentDownloader
from datetime import datetime
import re
import string
import nltk
import emoji
from urllib.parse import urlparse, parse_qs
from nltk.corpus import stopwords
import logging
from concurrent.futures import ThreadPoolExecutor
from fake_useragent import UserAgent
import random
import io
import base64
import math
import time
# --- NLTK and logging setup ---
# Extra search path for corpora pre-bundled in the Docker image.
nltk.data.path.append('/app/nltk_data')
try:
    nltk.download('stopwords', quiet=True)
except Exception as e:
    logging.error(f"Failed to download NLTK stopwords: {e}")
# NOTE(review): if the download failed AND no bundled copy exists under
# /app/nltk_data, the next line raises LookupError at import time.
arabic_stopwords = set(stopwords.words('arabic'))

# --- Logging configuration ---
logging.basicConfig(
    filename='youtube_scraper.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
app = FastAPI()

# --- Load the pre-trained models once at import time ---
# Both stay None on failure; the endpoint checks for that and reports an error.
# SECURITY NOTE: pickle.load executes arbitrary code from the file -- these
# model files must come only from the trusted build image, never from users.
loaded_quality_model = None
loaded_sentiment_pipeline = None
try:
    with open('models/final_youtube_quality_model.pkl', 'rb') as f:
        loaded_quality_model = pickle.load(f)
    logging.info("تم تحميل نموذج جودة الفيديو بنجاح.")
except FileNotFoundError:
    logging.error("خطأ: لم يتم العثور على ملف النموذج 'final_youtube_quality_model.pkl'.")
except Exception as e:
    logging.error(f"خطأ غير متوقع أثناء تحميل نموذج جودة الفيديو: {e}")
try:
    with open('models/best_sentiment_pipeline.pkl', 'rb') as f:
        loaded_sentiment_pipeline = pickle.load(f)
    logging.info("تم تحميل نموذج تصنيف المشاعر بنجاح.")
except FileNotFoundError:
    logging.error("خطأ: لم يتم العثور على ملف النموذج 'best_sentiment_pipeline.pkl'.")
except Exception as e:
    logging.error(f"خطأ غير متوقع أثناء تحميل نموذج تحليل المشاعر: {e}")
# --- User-Agent setup for yt-dlp ---
ua = UserAgent()


def get_desktop_user_agent():
    """Return a random desktop (non-mobile) browser User-Agent string.

    Draws from Chrome/Firefox/Safari and rejects mobile variants, looping
    until a desktop UA is found.
    """
    while True:
        candidate = random.choice([ua.chrome, ua.firefox, ua.safari])
        if all(x not in candidate for x in ['Mobile', 'Android', 'iPhone', 'iPad']):
            return candidate


selected_user_agent = get_desktop_user_agent()
headers = {'User-Agent': selected_user_agent}

# Path of the cookies file inside the Docker container.
COOKIES_FILE_PATH = 'cookies.txt'

# Shared yt-dlp options for metadata-only extraction.
ydl_opts_video_info = {
    'quiet': True,
    'skip_download': True,
    'extract_flat': True,        # metadata only; do not resolve formats
    'ignoreerrors': True,        # extract_info may return None instead of raising
    'no_warnings': True,
    'age_limit': 18,
    'force_generic_extractor': False,
    'http_headers': headers,
    'cookiefile': COOKIES_FILE_PATH  # yt-dlp keeps reading cookies from this file
}
# --- النموذج ---
class PlaylistRequest(BaseModel):
playlist_url: str
# --- دالة استخراج ID الفيديو ---
def extract_video_id(url):
if 'youtu.be/' in url:
return url.split('/')[-1].split('?')[0]
elif 'watch?v=' in url:
return parse_qs(urlparse(url).query).get('v', [None])[0]
return None
# --- تنظيف التعليقات ---
def preprocess_text(text):
if not isinstance(text, str):
return ""
text = emoji.demojize(text)
text = re.sub(r'http\S+', '', text)
text = text.translate(str.maketrans('', '', string.punctuation + string.digits))
text = text.lower()
text = re.sub(r'\s+', ' ', text).strip()
text_tokens = text.split()
filtered_text = [word for word in text_tokens if word not in arabic_stopwords]
return ' '.join(filtered_text)
# --- معالجة فيديو واحد فقط (نسخة 2) ---
def process_single_video2(video_url, loaded_quality_model, loaded_sentiment_pipeline, max_comments_per_video=50):
# تم حذف cookies_file من هنا
downloader = YoutubeCommentDownloader()
video_id = extract_video_id(video_url)
if not video_id:
logging.warning(f"رابط فيديو غير صالح: {video_url}. تم تجاهله.")
return None
try:
time.sleep(random.uniform(1, 3))
with yt_dlp.YoutubeDL(ydl_opts_video_info) as ydl:
info_dict = ydl.extract_info(video_url, download=False)
logging.info(f"[فيديو: {video_url}] تم استخراج البيانات: {info_dict.keys()}")
if not info_dict or info_dict.get('is_live', False) or info_dict.get('age_limit', 0) > 0:
logging.warning(f"لا يمكن معالجة الفيديو {video_id}: مباشر أو مقيد عمرًا. تم تجاهله.")
return None
views = info_dict.get('view_count', 0)
likes = info_dict.get('like_count', 0)
logging.info(f"[فيديو: {video_url}] المشاهدات: {views}, الإعجابات: {likes}")
upload_date = info_dict.get('upload_date', 'Unknown')
publish_year = int(upload_date[:4]) if upload_date != 'Unknown' else datetime.now().year
time.sleep(random.uniform(1, 3))
sampled_comments = []
try:
for comment in downloader.get_comments_from_url(video_url):
if 'text' in comment:
sampled_comments.append(comment['text'])
if len(sampled_comments) >= max_comments_per_video:
break
except Exception as e:
logging.warning(f"فشل في جلب التعليقات للفيديو {video_id}. السبب: {e}.")
sampled_comments = []
like_view_ratio = likes / views if views > 0 else 0.0
comment_view_ratio = len(sampled_comments) / views if views > 0 else 0.0
engagement_score = like_view_ratio + comment_view_ratio
positive_comments = 0
negative_comments = 0
overall_sentiment = "لا توجد تعليقات كافية"
if sampled_comments:
processed_comments = [preprocess_text(c) for c in sampled_comments]
sentiment_predictions = loaded_sentiment_pipeline.predict(processed_comments)
positive_comments = np.sum(sentiment_predictions == 1)
negative_comments = np.sum(sentiment_predictions == 0)
if positive_comments > negative_comments:
overall_sentiment = "إيجابي"
elif negative_comments > positive_comments:
overall_sentiment = "سلبي"
else:
overall_sentiment = "محايد"
input_df = pd.DataFrame([[views, likes, len(sampled_comments), 0, publish_year,
like_view_ratio, comment_view_ratio, engagement_score]],
columns=['views_count', 'likes_count', 'comments_count',
'video_duration_seconds', 'publish_year',
'like_view_ratio', 'comment_view_ratio', 'engagement_score'])
playlist_quality = "لم يتم التقييم"
try:
prediction_numeric = loaded_quality_model.predict(input_df)[0]
logging.info(f"[فيديو: {video_url}] نتيجة التنبؤ: {prediction_numeric}")
playlist_quality = "جيد" if prediction_numeric == 1 else "سيء"
except Exception as e:
playlist_quality = f"خطأ في التقييم: {e}"
logging.error(f"[فيديو: {video_url}] خطأ في تقييم الفيديو: {e}")
return {
"video_url": video_url,
"views": views,
"likes": likes,
"comments": len(sampled_comments),
"like_view_ratio": like_view_ratio,
"comment_view_ratio": comment_view_ratio,
"engagement_score": engagement_score,
"quality": playlist_quality,
"sentiment": overall_sentiment,
"positive_comments": positive_comments,
"negative_comments": negative_comments
}
except Exception as e:
logging.error(f"حدث خطأ في الفيديو {video_url}: {e}")
return None
@app.post("/evaluate_youtube_playlist_individually_same_method2/")
def evaluate_youtube_playlist_individually_same_method2(youtube_url: str, max_comments_per_video: int = 50, max_workers: int = 3):
    """Evaluate a YouTube playlist with a composite scoring scheme (v2).

    Fetches up to the first 10 videos of the playlist, scores each one in
    a thread pool, then aggregates per-video quality and comment sentiment
    into a weighted composite score. Returns an {"error": ...} dict (kept
    for backward compatibility) on failure.

    Fix: declared as plain `def` instead of `async def` -- the body is
    fully blocking (time.sleep, future.result), which would stall the
    event loop; FastAPI runs sync endpoints in a worker threadpool.
    """
    if loaded_quality_model is None or loaded_sentiment_pipeline is None:
        logging.error("لم يتم تحميل النماذج المطلوبة.")
        return {"error": "لم يتم تحميل النماذج المطلوبة."}
    video_links = []
    try:
        # yt-dlp's own 'cookiefile' option stays here.
        with yt_dlp.YoutubeDL({'extract_flat': True, 'quiet': True, 'playlist_items': '1:10', 'cookiefile': COOKIES_FILE_PATH}) as ydl:
            playlist_info = ydl.extract_info(youtube_url, download=False)
        # Fix: guard against extract_info returning None before the
        # membership test ('entries' in None raises TypeError).
        if playlist_info and 'entries' in playlist_info:
            for entry in playlist_info['entries'][:10]:
                if entry and 'url' in entry:
                    video_links.append(entry['url'])
        else:
            logging.warning("لا توجد فيديوهات في هذه القائمة.")
            return {"error": "لا توجد فيديوهات في هذه القائمة."}
    except Exception as e:
        logging.error(f"فشل في جلب روابط الفيديو: {e}")
        return {"error": f"فشل في جلب روابط الفيديو: {e}"}
    individual_results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(
                process_single_video2,
                video_url,
                loaded_quality_model,
                loaded_sentiment_pipeline,
                max_comments_per_video
            ) for video_url in video_links
        ]
        for future in futures:
            result = future.result()
            if result:
                individual_results.append(result)
            time.sleep(random.uniform(0.5, 1.5))  # stagger collection to throttle
    # --- Aggregate per-video results ---
    num_good_videos = sum(1 for r in individual_results if r and r.get('quality') == 'جيد')
    total_positive_comments = sum(r.get('positive_comments', 0) for r in individual_results if r)
    total_negative_comments = sum(r.get('negative_comments', 0) for r in individual_results if r)
    total_classified_comments = total_positive_comments + total_negative_comments
    total_videos = len(individual_results)
    percent_good_videos = (num_good_videos / total_videos) * 100 if total_videos > 0 else 0
    if percent_good_videos >= 70:
        overall_quality = "جيد جداً"
    elif percent_good_videos >= 50:
        overall_quality = "جيد"
    else:
        overall_quality = "سيء"
    # Composite score: 60% video quality, 40% positive-comment ratio.
    WEIGHT_QUALITY = 0.6
    WEIGHT_SENTIMENT = 0.4
    positive_ratio = (total_positive_comments / total_classified_comments) * 100 if total_classified_comments > 0 else 0.0
    composite_score = (WEIGHT_QUALITY * percent_good_videos) + (WEIGHT_SENTIMENT * positive_ratio)
    if composite_score >= 75:
        composite_quality = "ممتاز"
    elif composite_score >= 60:
        composite_quality = "جيد"
    else:
        composite_quality = "ضعيف"
    return {
        "overall_quality": overall_quality,
        "composite_quality": composite_quality,
        "composite_score": round(composite_score, 1),
        "percent_good_videos": round(percent_good_videos, 1),
        "positive_ratio": round(positive_ratio, 1),
        "negative_ratio": round(100 - positive_ratio, 1),
    }