# NOTE: removed Hugging Face Spaces page-scrape residue (status lines, file
# size, commit hashes, line-number gutter) that was not part of the source.
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import pickle
import pandas as pd
import numpy as np
import yt_dlp
from youtube_comment_downloader import YoutubeCommentDownloader
from datetime import datetime
import re
import string
import nltk
import emoji
from urllib.parse import urlparse, parse_qs
from nltk.corpus import stopwords
import logging
from concurrent.futures import ThreadPoolExecutor
from fake_useragent import UserAgent
import random
import io
import base64
import math
import time
# --- NLTK and logging setup ---
# Extra search path for corpora pre-bundled in the Docker image.
nltk.data.path.append('/app/nltk_data')
try:
    nltk.download('stopwords', quiet=True)
except Exception as e:
    logging.error(f"Failed to download NLTK stopwords: {e}")
# NOTE(review): if the download failed AND no bundled copy exists under
# /app/nltk_data, the next line raises LookupError at import time.
arabic_stopwords = set(stopwords.words('arabic'))

# --- Logging configuration ---
logging.basicConfig(
    filename='youtube_scraper.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
app = FastAPI()

# --- Load the pre-trained models once at import time ---
# Both stay None on failure; the endpoint checks for that and reports an error.
# SECURITY NOTE: pickle.load executes arbitrary code from the file -- these
# model files must come only from the trusted build image, never from users.
loaded_quality_model = None
loaded_sentiment_pipeline = None
try:
    with open('models/final_youtube_quality_model.pkl', 'rb') as f:
        loaded_quality_model = pickle.load(f)
    logging.info("تم تحميل نموذج جودة الفيديو بنجاح.")
except FileNotFoundError:
    logging.error("خطأ: لم يتم العثور على ملف النموذج 'final_youtube_quality_model.pkl'.")
except Exception as e:
    logging.error(f"خطأ غير متوقع أثناء تحميل نموذج جودة الفيديو: {e}")
try:
    with open('models/best_sentiment_pipeline.pkl', 'rb') as f:
        loaded_sentiment_pipeline = pickle.load(f)
    logging.info("تم تحميل نموذج تصنيف المشاعر بنجاح.")
except FileNotFoundError:
    logging.error("خطأ: لم يتم العثور على ملف النموذج 'best_sentiment_pipeline.pkl'.")
except Exception as e:
    logging.error(f"خطأ غير متوقع أثناء تحميل نموذج تحليل المشاعر: {e}")
# --- User-Agent setup for yt-dlp ---
ua = UserAgent()


def get_desktop_user_agent():
    """Return a random desktop (non-mobile) browser User-Agent string.

    Draws from Chrome/Firefox/Safari and rejects mobile variants, looping
    until a desktop UA is found.
    """
    while True:
        candidate = random.choice([ua.chrome, ua.firefox, ua.safari])
        if all(x not in candidate for x in ['Mobile', 'Android', 'iPhone', 'iPad']):
            return candidate


selected_user_agent = get_desktop_user_agent()
headers = {'User-Agent': selected_user_agent}

# Path of the cookies file inside the Docker container.
COOKIES_FILE_PATH = 'cookies.txt'

# Shared yt-dlp options for metadata-only extraction.
ydl_opts_video_info = {
    'quiet': True,
    'skip_download': True,
    'extract_flat': True,        # metadata only; do not resolve formats
    'ignoreerrors': True,        # extract_info may return None instead of raising
    'no_warnings': True,
    'age_limit': 18,
    'force_generic_extractor': False,
    'http_headers': headers,
    'cookiefile': COOKIES_FILE_PATH  # yt-dlp keeps reading cookies from this file
}
# --- النموذج ---
class PlaylistRequest(BaseModel):
playlist_url: str
# --- دالة استخراج ID الفيديو ---
def extract_video_id(url):
if 'youtu.be/' in url:
return url.split('/')[-1].split('?')[0]
elif 'watch?v=' in url:
return parse_qs(urlparse(url).query).get('v', [None])[0]
return None
# --- تنظيف التعليقات ---
def preprocess_text(text):
if not isinstance(text, str):
return ""
text = emoji.demojize(text)
text = re.sub(r'http\S+', '', text)
text = text.translate(str.maketrans('', '', string.punctuation + string.digits))
text = text.lower()
text = re.sub(r'\s+', ' ', text).strip()
text_tokens = text.split()
filtered_text = [word for word in text_tokens if word not in arabic_stopwords]
return ' '.join(filtered_text)
# --- معالجة فيديو واحد فقط (نسخة 2) ---
def process_single_video2(video_url, loaded_quality_model, loaded_sentiment_pipeline, max_comments_per_video=50):
# تم حذف cookies_file من هنا
downloader = YoutubeCommentDownloader()
video_id = extract_video_id(video_url)
if not video_id:
logging.warning(f"رابط فيديو غير صالح: {video_url}. تم تجاهله.")
return None
try:
time.sleep(random.uniform(1, 3))
with yt_dlp.YoutubeDL(ydl_opts_video_info) as ydl:
info_dict = ydl.extract_info(video_url, download=False)
logging.info(f"[فيديو: {video_url}] تم استخراج البيانات: {info_dict.keys()}")
if not info_dict or info_dict.get('is_live', False) or info_dict.get('age_limit', 0) > 0:
logging.warning(f"لا يمكن معالجة الفيديو {video_id}: مباشر أو مقيد عمرًا. تم تجاهله.")
return None
views = info_dict.get('view_count', 0)
likes = info_dict.get('like_count', 0)
logging.info(f"[فيديو: {video_url}] المشاهدات: {views}, الإعجابات: {likes}")
upload_date = info_dict.get('upload_date', 'Unknown')
publish_year = int(upload_date[:4]) if upload_date != 'Unknown' else datetime.now().year
time.sleep(random.uniform(1, 3))
sampled_comments = []
try:
for comment in downloader.get_comments_from_url(video_url):
if 'text' in comment:
sampled_comments.append(comment['text'])
if len(sampled_comments) >= max_comments_per_video:
break
except Exception as e:
logging.warning(f"فشل في جلب التعليقات للفيديو {video_id}. السبب: {e}.")
sampled_comments = []
like_view_ratio = likes / views if views > 0 else 0.0
comment_view_ratio = len(sampled_comments) / views if views > 0 else 0.0
engagement_score = like_view_ratio + comment_view_ratio
positive_comments = 0
negative_comments = 0
overall_sentiment = "لا توجد تعليقات كافية"
if sampled_comments:
processed_comments = [preprocess_text(c) for c in sampled_comments]
sentiment_predictions = loaded_sentiment_pipeline.predict(processed_comments)
positive_comments = np.sum(sentiment_predictions == 1)
negative_comments = np.sum(sentiment_predictions == 0)
if positive_comments > negative_comments:
overall_sentiment = "إيجابي"
elif negative_comments > positive_comments:
overall_sentiment = "سلبي"
else:
overall_sentiment = "محايد"
input_df = pd.DataFrame([[views, likes, len(sampled_comments), 0, publish_year,
like_view_ratio, comment_view_ratio, engagement_score]],
columns=['views_count', 'likes_count', 'comments_count',
'video_duration_seconds', 'publish_year',
'like_view_ratio', 'comment_view_ratio', 'engagement_score'])
playlist_quality = "لم يتم التقييم"
try:
prediction_numeric = loaded_quality_model.predict(input_df)[0]
logging.info(f"[فيديو: {video_url}] نتيجة التنبؤ: {prediction_numeric}")
playlist_quality = "جيد" if prediction_numeric == 1 else "سيء"
except Exception as e:
playlist_quality = f"خطأ في التقييم: {e}"
logging.error(f"[فيديو: {video_url}] خطأ في تقييم الفيديو: {e}")
return {
"video_url": video_url,
"views": views,
"likes": likes,
"comments": len(sampled_comments),
"like_view_ratio": like_view_ratio,
"comment_view_ratio": comment_view_ratio,
"engagement_score": engagement_score,
"quality": playlist_quality,
"sentiment": overall_sentiment,
"positive_comments": positive_comments,
"negative_comments": negative_comments
}
except Exception as e:
logging.error(f"حدث خطأ في الفيديو {video_url}: {e}")
return None
@app.post("/evaluate_youtube_playlist_individually_same_method2/")
def evaluate_youtube_playlist_individually_same_method2(youtube_url: str, max_comments_per_video: int = 50, max_workers: int = 3):
    """Evaluate a YouTube playlist with a composite scoring scheme (v2).

    Fetches up to the first 10 videos of the playlist, scores each one in
    a thread pool, then aggregates per-video quality and comment sentiment
    into a weighted composite score. Returns an {"error": ...} dict (kept
    for backward compatibility) on failure.

    Fix: declared as plain `def` instead of `async def` -- the body is
    fully blocking (time.sleep, future.result), which would stall the
    event loop; FastAPI runs sync endpoints in a worker threadpool.
    """
    if loaded_quality_model is None or loaded_sentiment_pipeline is None:
        logging.error("لم يتم تحميل النماذج المطلوبة.")
        return {"error": "لم يتم تحميل النماذج المطلوبة."}
    video_links = []
    try:
        # yt-dlp's own 'cookiefile' option stays here.
        with yt_dlp.YoutubeDL({'extract_flat': True, 'quiet': True, 'playlist_items': '1:10', 'cookiefile': COOKIES_FILE_PATH}) as ydl:
            playlist_info = ydl.extract_info(youtube_url, download=False)
        # Fix: guard against extract_info returning None before the
        # membership test ('entries' in None raises TypeError).
        if playlist_info and 'entries' in playlist_info:
            for entry in playlist_info['entries'][:10]:
                if entry and 'url' in entry:
                    video_links.append(entry['url'])
        else:
            logging.warning("لا توجد فيديوهات في هذه القائمة.")
            return {"error": "لا توجد فيديوهات في هذه القائمة."}
    except Exception as e:
        logging.error(f"فشل في جلب روابط الفيديو: {e}")
        return {"error": f"فشل في جلب روابط الفيديو: {e}"}
    individual_results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(
                process_single_video2,
                video_url,
                loaded_quality_model,
                loaded_sentiment_pipeline,
                max_comments_per_video
            ) for video_url in video_links
        ]
        for future in futures:
            result = future.result()
            if result:
                individual_results.append(result)
            time.sleep(random.uniform(0.5, 1.5))  # stagger collection to throttle
    # --- Aggregate per-video results ---
    num_good_videos = sum(1 for r in individual_results if r and r.get('quality') == 'جيد')
    total_positive_comments = sum(r.get('positive_comments', 0) for r in individual_results if r)
    total_negative_comments = sum(r.get('negative_comments', 0) for r in individual_results if r)
    total_classified_comments = total_positive_comments + total_negative_comments
    total_videos = len(individual_results)
    percent_good_videos = (num_good_videos / total_videos) * 100 if total_videos > 0 else 0
    if percent_good_videos >= 70:
        overall_quality = "جيد جداً"
    elif percent_good_videos >= 50:
        overall_quality = "جيد"
    else:
        overall_quality = "سيء"
    # Composite score: 60% video quality, 40% positive-comment ratio.
    WEIGHT_QUALITY = 0.6
    WEIGHT_SENTIMENT = 0.4
    positive_ratio = (total_positive_comments / total_classified_comments) * 100 if total_classified_comments > 0 else 0.0
    composite_score = (WEIGHT_QUALITY * percent_good_videos) + (WEIGHT_SENTIMENT * positive_ratio)
    if composite_score >= 75:
        composite_quality = "ممتاز"
    elif composite_score >= 60:
        composite_quality = "جيد"
    else:
        composite_quality = "ضعيف"
    return {
        "overall_quality": overall_quality,
        "composite_quality": composite_quality,
        "composite_score": round(composite_score, 1),
        "percent_good_videos": round(percent_good_videos, 1),
        "positive_ratio": round(positive_ratio, 1),
        "negative_ratio": round(100 - positive_ratio, 1),
    }