|
|
from fastapi import FastAPI, HTTPException |
|
|
from pydantic import BaseModel |
|
|
import pickle |
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
import yt_dlp |
|
|
from youtube_comment_downloader import YoutubeCommentDownloader |
|
|
from datetime import datetime |
|
|
import re |
|
|
import string |
|
|
import nltk |
|
|
import emoji |
|
|
from urllib.parse import urlparse, parse_qs |
|
|
from nltk.corpus import stopwords |
|
|
import logging |
|
|
from concurrent.futures import ThreadPoolExecutor |
|
|
from fake_useragent import UserAgent |
|
|
import random |
|
|
import io |
|
|
import base64 |
|
|
import math |
|
|
import time |
|
|
|
|
|
|
|
|
# Use the NLTK data directory baked into the container image first.
nltk.data.path.append('/app/nltk_data')

try:
    nltk.download('stopwords', quiet=True)
except Exception as e:
    logging.error(f"Failed to download NLTK stopwords: {e}")

# The download above is best-effort; guard the corpus lookup too so a missing
# corpus degrades to "no stop-word filtering" instead of crashing the module
# at import time (previously this line raised LookupError unguarded).
try:
    arabic_stopwords = set(stopwords.words('arabic'))
except LookupError as e:
    logging.error(f"Arabic stopwords corpus unavailable: {e}")
    arabic_stopwords = set()
|
|
|
|
|
|
|
|
# File-based logging so scraper activity persists across runs/restarts.
logging.basicConfig(
    filename='youtube_scraper.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# FastAPI application instance; routes are registered via decorators below.
app = FastAPI()
|
|
|
|
|
|
|
|
loaded_quality_model = None |
|
|
loaded_sentiment_pipeline = None |
|
|
|
|
|
try: |
|
|
with open('models/final_youtube_quality_model.pkl', 'rb') as f: |
|
|
loaded_quality_model = pickle.load(f) |
|
|
logging.info("تم تحميل نموذج جودة الفيديو بنجاح.") |
|
|
except FileNotFoundError: |
|
|
logging.error("خطأ: لم يتم العثور على ملف النموذج 'final_youtube_quality_model.pkl'.") |
|
|
except Exception as e: |
|
|
logging.error(f"خطأ غير متوقع أثناء تحميل نموذج جودة الفيديو: {e}") |
|
|
|
|
|
try: |
|
|
with open('models/best_sentiment_pipeline.pkl', 'rb') as f: |
|
|
loaded_sentiment_pipeline = pickle.load(f) |
|
|
logging.info("تم تحميل نموذج تصنيف المشاعر بنجاح.") |
|
|
except FileNotFoundError: |
|
|
logging.error("خطأ: لم يتم العثور على ملف النموذج 'best_sentiment_pipeline.pkl'.") |
|
|
except Exception as e: |
|
|
logging.error(f"خطأ غير متوقع أثناء تحميل نموذج تحليل المشاعر: {e}") |
|
|
|
|
|
|
|
|
# fake-useragent source of realistic browser User-Agent strings.
ua = UserAgent()

# Static desktop UA used only when random draws keep yielding mobile strings.
_FALLBACK_DESKTOP_UA = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
)


def get_desktop_user_agent(max_attempts=50):
    """Return a random desktop (non-mobile) User-Agent string.

    Draws up to ``max_attempts`` candidates from fake-useragent's Chrome,
    Firefox and Safari pools and rejects anything carrying a mobile marker.
    Previously this looped forever (``while True``); it now falls back to a
    static desktop UA after the attempt budget is exhausted.

    Parameters:
        max_attempts: upper bound on random draws before falling back.
    """
    mobile_markers = ('Mobile', 'Android', 'iPhone', 'iPad')
    for _ in range(max_attempts):
        candidate = random.choice([ua.chrome, ua.firefox, ua.safari])
        if all(marker not in candidate for marker in mobile_markers):
            return candidate
    logging.warning("Could not draw a desktop User-Agent; using static fallback.")
    return _FALLBACK_DESKTOP_UA


selected_user_agent = get_desktop_user_agent()

# Shared headers for all outbound HTTP requests made through yt-dlp.
headers = {'User-Agent': selected_user_agent}
|
|
|
|
|
|
|
|
# Netscape-format cookie jar handed to yt-dlp (helps with gated videos).
COOKIES_FILE_PATH = 'cookies.txt'

# yt-dlp options for metadata-only extraction (no media download).
ydl_opts_video_info = {
    'quiet': True,                     # suppress yt-dlp console output
    'skip_download': True,             # metadata only, never fetch media
    'extract_flat': True,              # do not resolve nested playlist entries
    'ignoreerrors': True,              # per-entry failures yield None, not raise
    'no_warnings': True,
    'age_limit': 18,                   # allow metadata for content rated up to 18
    'force_generic_extractor': False,  # let yt-dlp pick the YouTube extractor
    'http_headers': headers,           # randomized desktop User-Agent
    'cookiefile': COOKIES_FILE_PATH
}
|
|
|
|
|
|
|
|
class PlaylistRequest(BaseModel):
    """Request body schema carrying a playlist URL.

    NOTE(review): appears unused in this module (the endpoint takes query
    parameters instead) — confirm before removing.
    """
    playlist_url: str
|
|
|
|
|
|
|
|
def extract_video_id(url):
    """Extract the video ID from a YouTube URL.

    Supports short links (``youtu.be/<id>``) and standard watch URLs
    (``watch?v=<id>``) as before, and additionally the ``/shorts/``,
    ``/embed/``, ``/live/`` and ``/v/`` path forms.

    Parameters:
        url: candidate YouTube URL (non-strings are tolerated).

    Returns:
        The video ID string, or None when no ID can be found.
    """
    if not isinstance(url, str) or not url:
        return None
    if 'youtu.be/' in url:
        # Everything after the host, up to any query string or extra path.
        tail = url.split('youtu.be/', 1)[1]
        return tail.split('?')[0].split('/')[0] or None
    parsed = urlparse(url)
    for prefix in ('/shorts/', '/embed/', '/live/', '/v/'):
        if prefix in parsed.path:
            candidate = parsed.path.split(prefix, 1)[1].split('/')[0]
            if candidate:
                return candidate
    if 'watch?v=' in url:
        return parse_qs(parsed.query).get('v', [None])[0]
    return None
|
|
|
|
|
|
|
|
def preprocess_text(text):
    """Normalize a comment string for the sentiment pipeline.

    Demojizes emoji into text tokens, strips URLs, punctuation and digits,
    lowercases, collapses whitespace, and drops Arabic stop words.
    Non-string input yields the empty string.
    """
    if not isinstance(text, str):
        return ""
    cleaned = emoji.demojize(text)
    cleaned = re.sub(r'http\S+', '', cleaned)
    removal_table = str.maketrans('', '', string.punctuation + string.digits)
    cleaned = cleaned.translate(removal_table).lower()
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    kept_tokens = (token for token in cleaned.split() if token not in arabic_stopwords)
    return ' '.join(kept_tokens)
|
|
|
|
|
|
|
|
|
|
|
def process_single_video2(video_url, loaded_quality_model, loaded_sentiment_pipeline, max_comments_per_video=50):
    """Scrape one YouTube video and score its quality and comment sentiment.

    Extracts view/like metadata with yt-dlp, samples up to
    ``max_comments_per_video`` comments, runs the sentiment pipeline over the
    sample, and asks the quality model for a good/bad prediction.

    Parameters:
        video_url: full URL of the video to process.
        loaded_quality_model: fitted model exposing ``.predict(DataFrame)``.
        loaded_sentiment_pipeline: fitted pipeline exposing ``.predict`` over a
            list of strings; per the counting below, 1 == positive, 0 == negative.
        max_comments_per_video: cap on the comment sample size.

    Returns:
        dict of per-video metrics and labels, or None when the URL is invalid,
        the video is live or age-restricted, or an unexpected error occurs.
    """
    downloader = YoutubeCommentDownloader()

    video_id = extract_video_id(video_url)
    if not video_id:
        # Unrecognized URL form — skip this video.
        logging.warning(f"رابط فيديو غير صالح: {video_url}. تم تجاهله.")
        return None

    try:
        # Jittered delay to reduce the chance of being rate-limited.
        time.sleep(random.uniform(1, 3))

        with yt_dlp.YoutubeDL(ydl_opts_video_info) as ydl:
            info_dict = ydl.extract_info(video_url, download=False)

        logging.info(f"[فيديو: {video_url}] تم استخراج البيانات: {info_dict.keys()}")

        # Skip live streams and age-restricted videos.
        if not info_dict or info_dict.get('is_live', False) or info_dict.get('age_limit', 0) > 0:
            logging.warning(f"لا يمكن معالجة الفيديو {video_id}: مباشر أو مقيد عمرًا. تم تجاهله.")
            return None

        views = info_dict.get('view_count', 0)
        likes = info_dict.get('like_count', 0)

        logging.info(f"[فيديو: {video_url}] المشاهدات: {views}, الإعجابات: {likes}")

        # upload_date is YYYYMMDD; fall back to the current year when absent.
        upload_date = info_dict.get('upload_date', 'Unknown')
        publish_year = int(upload_date[:4]) if upload_date != 'Unknown' else datetime.now().year

        time.sleep(random.uniform(1, 3))

        # Sample up to max_comments_per_video comment texts; comment fetch
        # failures are non-fatal — we proceed with an empty sample.
        sampled_comments = []
        try:
            for comment in downloader.get_comments_from_url(video_url):
                if 'text' in comment:
                    sampled_comments.append(comment['text'])
                if len(sampled_comments) >= max_comments_per_video:
                    break
        except Exception as e:
            logging.warning(f"فشل في جلب التعليقات للفيديو {video_id}. السبب: {e}.")
            sampled_comments = []

        # Engagement features; ratios default to 0.0 when there are no views.
        # NOTE(review): comment_view_ratio uses the SAMPLED comment count
        # (capped at max_comments_per_video), not the video's true comment
        # total — confirm the quality model was trained with the same notion.
        # NOTE(review): likes may be None when the uploader hides the count;
        # that would raise here and be swallowed by the outer except — verify.
        like_view_ratio = likes / views if views > 0 else 0.0
        comment_view_ratio = len(sampled_comments) / views if views > 0 else 0.0
        engagement_score = like_view_ratio + comment_view_ratio

        positive_comments = 0
        negative_comments = 0
        overall_sentiment = "لا توجد تعليقات كافية"

        if sampled_comments:
            # Classify the sample; label 1 counts as positive, 0 as negative.
            processed_comments = [preprocess_text(c) for c in sampled_comments]
            sentiment_predictions = loaded_sentiment_pipeline.predict(processed_comments)
            positive_comments = np.sum(sentiment_predictions == 1)
            negative_comments = np.sum(sentiment_predictions == 0)

            if positive_comments > negative_comments:
                overall_sentiment = "إيجابي"
            elif negative_comments > positive_comments:
                overall_sentiment = "سلبي"
            else:
                overall_sentiment = "محايد"

        # Feature row for the quality model. video_duration_seconds is
        # hard-coded to 0 (duration is never extracted above).
        input_df = pd.DataFrame([[views, likes, len(sampled_comments), 0, publish_year,
                                  like_view_ratio, comment_view_ratio, engagement_score]],
                                columns=['views_count', 'likes_count', 'comments_count',
                                         'video_duration_seconds', 'publish_year',
                                         'like_view_ratio', 'comment_view_ratio', 'engagement_score'])

        playlist_quality = "لم يتم التقييم"
        try:
            # Quality model output: 1 == good, anything else == bad.
            prediction_numeric = loaded_quality_model.predict(input_df)[0]
            logging.info(f"[فيديو: {video_url}] نتيجة التنبؤ: {prediction_numeric}")
            playlist_quality = "جيد" if prediction_numeric == 1 else "سيء"
        except Exception as e:
            playlist_quality = f"خطأ في التقييم: {e}"
            logging.error(f"[فيديو: {video_url}] خطأ في تقييم الفيديو: {e}")

        return {
            "video_url": video_url,
            "views": views,
            "likes": likes,
            "comments": len(sampled_comments),
            "like_view_ratio": like_view_ratio,
            "comment_view_ratio": comment_view_ratio,
            "engagement_score": engagement_score,
            "quality": playlist_quality,
            "sentiment": overall_sentiment,
            "positive_comments": positive_comments,
            "negative_comments": negative_comments
        }

    except Exception as e:
        # Broad catch so one failed video does not abort the whole batch.
        logging.error(f"حدث خطأ في الفيديو {video_url}: {e}")
        return None
|
|
|
|
|
@app.post("/evaluate_youtube_playlist_individually_same_method2/")
def evaluate_youtube_playlist_individually_same_method2(youtube_url: str, max_comments_per_video: int = 50, max_workers: int = 3):
    """
    تقييم قائمة تشغيل يوتيوب باستخدام نظام مركب (نسخة 2)

    Evaluates up to the first 10 videos of a YouTube playlist in parallel,
    combining a per-video quality prediction with comment sentiment into a
    composite playlist score.

    Declared as a plain ``def`` (not ``async def``) on purpose: the body is
    entirely blocking work (yt-dlp, time.sleep, future.result), so FastAPI
    runs it in its threadpool instead of stalling the event loop.

    Parameters:
        youtube_url: playlist URL handed to yt-dlp.
        max_comments_per_video: comment sample size per video.
        max_workers: thread-pool size for per-video processing.

    Returns:
        dict with overall/composite quality labels and ratio breakdowns,
        or ``{"error": ...}`` on failure.
    """
    if loaded_quality_model is None or loaded_sentiment_pipeline is None:
        logging.error("لم يتم تحميل النماذج المطلوبة.")
        return {"error": "لم يتم تحميل النماذج المطلوبة."}

    video_links = []
    try:
        with yt_dlp.YoutubeDL({'extract_flat': True, 'quiet': True, 'playlist_items': '1:10', 'cookiefile': COOKIES_FILE_PATH}) as ydl:
            playlist_info = ydl.extract_info(youtube_url, download=False)
            # extract_info can return None on failure — guard before the
            # membership test (previously this raised TypeError on None).
            if playlist_info and playlist_info.get('entries'):
                # entries may be a list or a generator depending on the
                # yt-dlp version, so cap at 10 via enumerate, not slicing.
                for index, entry in enumerate(playlist_info['entries']):
                    if index >= 10:
                        break
                    if entry and 'url' in entry:
                        video_links.append(entry['url'])
            else:
                logging.warning("لا توجد فيديوهات في هذه القائمة.")
                return {"error": "لا توجد فيديوهات في هذه القائمة."}
    except Exception as e:
        logging.error(f"فشل في جلب روابط الفيديو: {e}")
        return {"error": f"فشل في جلب روابط الفيديو: {e}"}

    individual_results = []

    # Fan out per-video scraping/scoring; each worker does its own I/O.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(
                process_single_video2,
                video_url,
                loaded_quality_model,
                loaded_sentiment_pipeline,
                max_comments_per_video
            ) for video_url in video_links
        ]

        for future in futures:
            result = future.result()
            if result:
                individual_results.append(result)
            # Small pause between result pulls to pace outbound requests.
            time.sleep(random.uniform(0.5, 1.5))

    # Aggregate per-video outcomes into playlist-level statistics.
    num_good_videos = sum(1 for r in individual_results if r and r.get('quality') == 'جيد')
    total_positive_comments = sum(r.get('positive_comments', 0) for r in individual_results if r)
    total_negative_comments = sum(r.get('negative_comments', 0) for r in individual_results if r)
    total_classified_comments = total_positive_comments + total_negative_comments

    total_videos = len(individual_results)
    percent_good_videos = (num_good_videos / total_videos) * 100 if total_videos > 0 else 0

    if percent_good_videos >= 70:
        overall_quality = "جيد جداً"
    elif percent_good_videos >= 50:
        overall_quality = "جيد"
    else:
        overall_quality = "سيء"

    # Composite score: weighted blend of good-video share and positive share.
    WEIGHT_QUALITY = 0.6
    WEIGHT_SENTIMENT = 0.4

    positive_ratio = (total_positive_comments / total_classified_comments) * 100 if total_classified_comments > 0 else 0.0
    composite_score = (WEIGHT_QUALITY * percent_good_videos) + (WEIGHT_SENTIMENT * positive_ratio)

    if composite_score >= 75:
        composite_quality = "ممتاز"
    elif composite_score >= 60:
        composite_quality = "جيد"
    else:
        composite_quality = "ضعيف"

    return {
        "overall_quality": overall_quality,
        "composite_quality": composite_quality,
        "composite_score": round(composite_score, 1),
        "percent_good_videos": round(percent_good_videos, 1),
        "positive_ratio": round(positive_ratio, 1),
        "negative_ratio": round(100 - positive_ratio, 1),
    }