File size: 11,740 Bytes
5ea3020
 
 
 
 
 
 
 
 
 
 
 
 
 
32fa772
 
 
 
1477024
 
 
6b2fceb
5ea3020
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e0f401d
5ea3020
 
 
 
 
 
 
 
e0f401d
5ea3020
 
 
 
 
 
 
6b2fceb
5ea3020
 
 
 
 
 
 
 
 
 
 
 
2ef501e
6b2fceb
2ef501e
5ea3020
 
 
 
 
 
 
 
2ef501e
6b2fceb
5ea3020
 
 
 
 
 
 
 
 
 
 
 
 
 
32fa772
5ea3020
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6b2fceb
 
5ea3020
 
 
 
 
 
 
6b2fceb
32fa772
5ea3020
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5f26048
5ea3020
6b2fceb
32fa772
5ea3020
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5f26048
 
 
5ea3020
 
 
 
5f26048
5ea3020
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32fa772
5f26048
5ea3020
 
 
 
 
 
 
 
 
 
6b2fceb
 
5ea3020
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2ef501e
5ea3020
 
 
2ef501e
5ea3020
32fa772
5f26048
 
5ea3020
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d36dd91
5ea3020
 
 
d36dd91
5ea3020
 
abeaf31
5ea3020
abeaf31
 
5ea3020
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import pickle
import pandas as pd
import numpy as np
import yt_dlp
from youtube_comment_downloader import YoutubeCommentDownloader
from datetime import datetime
import re
import string
import nltk
import emoji
from urllib.parse import urlparse, parse_qs
from nltk.corpus import stopwords
import logging
from concurrent.futures import ThreadPoolExecutor
from fake_useragent import UserAgent
import random
import io
import base64
import math
import time

# --- NLTK and logging setup ---
# Point NLTK at the data directory baked into the Docker image.
nltk.data.path.append('/app/nltk_data')
try:
    # Best-effort download; quiet=True suppresses progress output.
    nltk.download('stopwords', quiet=True)
except Exception as e:
    logging.error(f"Failed to download NLTK stopwords: {e}")

# NOTE(review): if the download failed AND the corpus is missing from
# /app/nltk_data, this line raises LookupError at import time — confirm the
# image always ships the corpus.
arabic_stopwords = set(stopwords.words('arabic'))

# --- Logging configuration (file-based, INFO level) ---
logging.basicConfig(
    filename='youtube_scraper.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

app = FastAPI()

# Load the pickled ML models at import time. On failure the globals stay
# None; the endpoint checks for this and returns an error instead of serving.
loaded_quality_model = None
loaded_sentiment_pipeline = None

try:
    with open('models/final_youtube_quality_model.pkl', 'rb') as f:
        loaded_quality_model = pickle.load(f)
    logging.info("تم تحميل نموذج جودة الفيديو بنجاح.")
except FileNotFoundError:
    logging.error("خطأ: لم يتم العثور على ملف النموذج 'final_youtube_quality_model.pkl'.")
except Exception as e:
    logging.error(f"خطأ غير متوقع أثناء تحميل نموذج جودة الفيديو: {e}")

try:
    with open('models/best_sentiment_pipeline.pkl', 'rb') as f:
        loaded_sentiment_pipeline = pickle.load(f)
    logging.info("تم تحميل نموذج تصنيف المشاعر بنجاح.")
except FileNotFoundError:
    logging.error("خطأ: لم يتم العثور على ملف النموذج 'best_sentiment_pipeline.pkl'.")
except Exception as e:
    logging.error(f"خطأ غير متوقع أثناء تحميل نموذج تحليل المشاعر: {e}")

# --- User-Agent setup for yt-dlp ---
ua = UserAgent()

def get_desktop_user_agent():
    """Return a random desktop browser User-Agent string.

    Re-draws until the candidate contains none of the mobile markers.
    """
    mobile_markers = ('Mobile', 'Android', 'iPhone', 'iPad')
    while True:
        candidate = random.choice([ua.chrome, ua.firefox, ua.safari])
        if not any(marker in candidate for marker in mobile_markers):
            return candidate

selected_user_agent = get_desktop_user_agent()

headers = {'User-Agent': selected_user_agent}

# Path to the cookies file inside the Docker container.
COOKIES_FILE_PATH = 'cookies.txt' 

# yt-dlp options for metadata-only extraction (no media download).
ydl_opts_video_info = {
    'quiet': True,
    'skip_download': True,
    'extract_flat': True,
    'ignoreerrors': True,  # NOTE: extract_info returns None instead of raising
    'no_warnings': True,
    'age_limit': 18,
    'force_generic_extractor': False,
    'http_headers': headers,  # randomized desktop User-Agent (see above)
    'cookiefile': COOKIES_FILE_PATH # this option remains for yt-dlp
}

# --- Request model ---
class PlaylistRequest(BaseModel):
    """Request body carrying a YouTube playlist URL.

    NOTE(review): not referenced by the endpoint in this file, which reads
    query parameters instead — confirm whether this model is still needed.
    """
    playlist_url: str

# --- Extract the video ID from a URL ---
def extract_video_id(url):
    """Return the YouTube video ID embedded in *url*, or None.

    Handles short links (youtu.be/<id>), standard watch URLs
    (watch?v=<id>), and additionally /shorts/<id> and /embed/<id> paths.

    BUGFIX: the old youtu.be branch only stripped text after '?', so an
    URL fragment ('#...') leaked into the returned ID; urlparse separates
    query and fragment correctly.
    """
    parsed = urlparse(url)
    if 'youtu.be' in parsed.netloc:
        # Short link: the ID is the first path segment.
        return parsed.path.strip('/').split('/')[0] or None
    if 'watch?v=' in url:
        return parse_qs(parsed.query).get('v', [None])[0]
    # Generalization: shorts and embed URL layouts carry the ID in the path.
    for prefix in ('/shorts/', '/embed/'):
        if parsed.path.startswith(prefix):
            return parsed.path[len(prefix):].split('/')[0] or None
    return None

# --- Comment cleaning ---
def preprocess_text(text):
    """Normalize a comment string for the sentiment model.

    Demojizes, strips URLs, removes punctuation and digits, lowercases,
    collapses whitespace, and drops Arabic stopwords. Non-strings → "".
    """
    if not isinstance(text, str):
        return ""
    cleaned = emoji.demojize(text)
    cleaned = re.sub(r'http\S+', '', cleaned)
    removal_table = str.maketrans('', '', string.punctuation + string.digits)
    cleaned = cleaned.translate(removal_table).lower()
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    kept_tokens = (tok for tok in cleaned.split() if tok not in arabic_stopwords)
    return ' '.join(kept_tokens)


# --- Process a single video (version 2) ---
def process_single_video2(video_url, loaded_quality_model, loaded_sentiment_pipeline, max_comments_per_video=50):
    """Fetch one video's metadata and comments, then score it.

    Runs the quality model on engagement features and the sentiment
    pipeline on up to ``max_comments_per_video`` sampled comments.

    Returns a result dict, or None when the URL is invalid, the video is
    live/age-restricted, metadata extraction fails, or an unexpected
    error occurs (all cases are logged).
    """
    # cookies_file was removed from here (yt-dlp carries the cookiefile option)
    downloader = YoutubeCommentDownloader()

    video_id = extract_video_id(video_url)
    if not video_id:
        logging.warning(f"رابط فيديو غير صالح: {video_url}. تم تجاهله.")
        return None

    try:
        # Random delay to reduce the chance of being rate-limited.
        time.sleep(random.uniform(1, 3))

        with yt_dlp.YoutubeDL(ydl_opts_video_info) as ydl:
            info_dict = ydl.extract_info(video_url, download=False)

            # BUGFIX: validate info_dict BEFORE using it. With
            # 'ignoreerrors': True, extract_info returns None on failure and
            # the old code crashed on info_dict.keys() before this check.
            if not info_dict or info_dict.get('is_live', False) or info_dict.get('age_limit', 0) > 0:
                logging.warning(f"لا يمكن معالجة الفيديو {video_id}: مباشر أو مقيد عمرًا. تم تجاهله.")
                return None

            logging.info(f"[فيديو: {video_url}] تم استخراج البيانات: {info_dict.keys()}")

            # BUGFIX: yt-dlp can report these keys with an explicit None
            # value, which broke the ratio arithmetic below — coerce to 0.
            views = info_dict.get('view_count') or 0
            likes = info_dict.get('like_count') or 0

            logging.info(f"[فيديو: {video_url}] المشاهدات: {views}, الإعجابات: {likes}")

            upload_date = info_dict.get('upload_date', 'Unknown')
            # upload_date is 'YYYYMMDD'; fall back to the current year.
            publish_year = int(upload_date[:4]) if upload_date != 'Unknown' else datetime.now().year

        time.sleep(random.uniform(1, 3))

        # Best-effort comment sampling; failures leave the list empty.
        sampled_comments = []
        try:
            for comment in downloader.get_comments_from_url(video_url):
                if 'text' in comment:
                    sampled_comments.append(comment['text'])
                    if len(sampled_comments) >= max_comments_per_video:
                        break
        except Exception as e:
            logging.warning(f"فشل في جلب التعليقات للفيديو {video_id}. السبب: {e}.")
            sampled_comments = []

        # Engagement features consumed by the quality model.
        like_view_ratio = likes / views if views > 0 else 0.0
        comment_view_ratio = len(sampled_comments) / views if views > 0 else 0.0
        engagement_score = like_view_ratio + comment_view_ratio

        positive_comments = 0
        negative_comments = 0
        overall_sentiment = "لا توجد تعليقات كافية"

        if sampled_comments:
            processed_comments = [preprocess_text(c) for c in sampled_comments]
            sentiment_predictions = loaded_sentiment_pipeline.predict(processed_comments)
            # BUGFIX: cast numpy integers to plain ints so the JSON response
            # serializes cleanly.
            positive_comments = int(np.sum(sentiment_predictions == 1))
            negative_comments = int(np.sum(sentiment_predictions == 0))

            if positive_comments > negative_comments:
                overall_sentiment = "إيجابي"
            elif negative_comments > positive_comments:
                overall_sentiment = "سلبي"
            else:
                overall_sentiment = "محايد"

        # Feature row; video duration is unavailable here, so 0 is passed.
        input_df = pd.DataFrame([[views, likes, len(sampled_comments), 0, publish_year,
                                  like_view_ratio, comment_view_ratio, engagement_score]],
                                 columns=['views_count', 'likes_count', 'comments_count',
                                          'video_duration_seconds', 'publish_year',
                                          'like_view_ratio', 'comment_view_ratio', 'engagement_score'])

        playlist_quality = "لم يتم التقييم"
        try:
            prediction_numeric = loaded_quality_model.predict(input_df)[0]
            logging.info(f"[فيديو: {video_url}] نتيجة التنبؤ: {prediction_numeric}")
            playlist_quality = "جيد" if prediction_numeric == 1 else "سيء"
        except Exception as e:
            playlist_quality = f"خطأ في التقييم: {e}"
            logging.error(f"[فيديو: {video_url}] خطأ في تقييم الفيديو: {e}")

        return {
            "video_url": video_url,
            "views": views,
            "likes": likes,
            "comments": len(sampled_comments),
            "like_view_ratio": like_view_ratio,
            "comment_view_ratio": comment_view_ratio,
            "engagement_score": engagement_score,
            "quality": playlist_quality,
            "sentiment": overall_sentiment,
            "positive_comments": positive_comments,
            "negative_comments": negative_comments
        }

    except Exception as e:
        logging.error(f"حدث خطأ في الفيديو {video_url}: {e}")
        return None

@app.post("/evaluate_youtube_playlist_individually_same_method2/")
async def evaluate_youtube_playlist_individually_same_method2(youtube_url: str, max_comments_per_video: int = 50, max_workers: int = 3):
    """Evaluate a YouTube playlist with a composite score (version 2).

    Query parameters:
        youtube_url: playlist URL understood by yt-dlp.
        max_comments_per_video: comment sample size per video.
        max_workers: thread-pool size for per-video processing.

    Returns a summary dict, or {"error": ...} on failure.
    """
    if loaded_quality_model is None or loaded_sentiment_pipeline is None:
        logging.error("لم يتم تحميل النماذج المطلوبة.")
        return {"error": "لم يتم تحميل النماذج المطلوبة."}

    video_links = []
    try:
        # The cookiefile option for yt-dlp stays here; only first 10 entries.
        with yt_dlp.YoutubeDL({'extract_flat': True, 'quiet': True, 'playlist_items': '1:10', 'cookiefile': COOKIES_FILE_PATH}) as ydl:
            playlist_info = ydl.extract_info(youtube_url, download=False)
            # BUGFIX: extract_info may return None; the old membership test
            # ('entries' in playlist_info) raised TypeError in that case.
            if not playlist_info or 'entries' not in playlist_info:
                logging.warning("لا توجد فيديوهات في هذه القائمة.")
                return {"error": "لا توجد فيديوهات في هذه القائمة."}
            for entry in playlist_info['entries'][:10]:
                if entry and 'url' in entry:
                    video_links.append(entry['url'])
    except Exception as e:
        logging.error(f"فشل في جلب روابط الفيديو: {e}")
        return {"error": f"فشل في جلب روابط الفيديو: {e}"}

    # ROBUSTNESS: entries may exist but all be unusable (None / no 'url').
    if not video_links:
        logging.warning("لا توجد فيديوهات في هذه القائمة.")
        return {"error": "لا توجد فيديوهات في هذه القائمة."}

    individual_results = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(
                process_single_video2,
                video_url,
                loaded_quality_model,
                loaded_sentiment_pipeline,
                max_comments_per_video
            ) for video_url in video_links
        ]

        # Collect in submission order; skip videos that returned None.
        for future in futures:
            result = future.result()
            if result:
                individual_results.append(result)
            time.sleep(random.uniform(0.5, 1.5))

    num_good_videos = sum(1 for r in individual_results if r.get('quality') == 'جيد')
    # Cast to int so numpy integer counts cannot leak into the JSON response.
    total_positive_comments = sum(int(r.get('positive_comments', 0)) for r in individual_results)
    total_negative_comments = sum(int(r.get('negative_comments', 0)) for r in individual_results)
    total_classified_comments = total_positive_comments + total_negative_comments

    total_videos = len(individual_results)
    percent_good_videos = (num_good_videos / total_videos) * 100 if total_videos > 0 else 0

    # Playlist verdict from the share of "good" videos.
    if percent_good_videos >= 70:
        overall_quality = "جيد جداً"
    elif percent_good_videos >= 50:
        overall_quality = "جيد"
    else:
        overall_quality = "سيء"

    # Composite score: 60% video quality share + 40% positive-comment ratio.
    WEIGHT_QUALITY = 0.6
    WEIGHT_SENTIMENT = 0.4

    positive_ratio = (total_positive_comments / total_classified_comments) * 100 if total_classified_comments > 0 else 0.0
    composite_score = (WEIGHT_QUALITY * percent_good_videos) + (WEIGHT_SENTIMENT * positive_ratio)

    if composite_score >= 75:
        composite_quality = "ممتاز"
    elif composite_score >= 60:
        composite_quality = "جيد"
    else:
        composite_quality = "ضعيف"

    return {
        "overall_quality": overall_quality,
        "composite_quality": composite_quality,
        "composite_score": round(composite_score, 1),
        "percent_good_videos": round(percent_good_videos, 1),
        "positive_ratio": round(positive_ratio, 1),
        "negative_ratio": round(100 - positive_ratio, 1),
    }