Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
|
@@ -19,17 +19,14 @@ import random
|
|
| 19 |
import io
|
| 20 |
import base64
|
| 21 |
import math
|
| 22 |
-
import time
|
| 23 |
|
| 24 |
# --- إعدادات NLTK و Logging ---
|
| 25 |
-
# ضبط مسار بيانات NLTK ليطابق Dockerfile
|
| 26 |
nltk.data.path.append('/app/nltk_data')
|
| 27 |
try:
|
| 28 |
-
# سيتم تنزيلها مرة واحدة في Dockerfile، ولكن هذا يضمن أنها متاحة
|
| 29 |
nltk.download('stopwords', quiet=True)
|
| 30 |
except Exception as e:
|
| 31 |
logging.error(f"Failed to download NLTK stopwords: {e}")
|
| 32 |
-
# التعامل مع الخطأ إذا لم يتمكن من التنزيل (مثلاً إذا لم يكن المسار صحيحاً أو صلاحيات)
|
| 33 |
|
| 34 |
arabic_stopwords = set(stopwords.words('arabic'))
|
| 35 |
|
|
@@ -64,7 +61,7 @@ except FileNotFoundError:
|
|
| 64 |
except Exception as e:
|
| 65 |
logging.error(f"خطأ غير متوقع أثناء تحميل نموذج تحليل المشاعر: {e}")
|
| 66 |
|
| 67 |
-
# --- إعدادات User-Agent لـ yt-dlp
|
| 68 |
ua = UserAgent()
|
| 69 |
|
| 70 |
def get_desktop_user_agent():
|
|
@@ -78,8 +75,7 @@ selected_user_agent = get_desktop_user_agent()
|
|
| 78 |
headers = {'User-Agent': selected_user_agent}
|
| 79 |
|
| 80 |
# تحديد مسار ملف الكوكيز داخل حاوية Docker
|
| 81 |
-
|
| 82 |
-
COOKIES_FILE_PATH = 'cookies.txt' # افتراض أن cookies.txt في الجذر /app/
|
| 83 |
|
| 84 |
ydl_opts_video_info = {
|
| 85 |
'quiet': True,
|
|
@@ -90,7 +86,7 @@ ydl_opts_video_info = {
|
|
| 90 |
'age_limit': 18,
|
| 91 |
'force_generic_extractor': False,
|
| 92 |
'http_headers': headers,
|
| 93 |
-
'cookiefile': COOKIES_FILE_PATH #
|
| 94 |
}
|
| 95 |
|
| 96 |
# --- النموذج ---
|
|
@@ -121,8 +117,8 @@ def preprocess_text(text):
|
|
| 121 |
|
| 122 |
# --- معالجة فيديو واحد فقط (نسخة 2) ---
|
| 123 |
def process_single_video2(video_url, loaded_quality_model, loaded_sentiment_pipeline, max_comments_per_video=50):
|
| 124 |
-
#
|
| 125 |
-
downloader = YoutubeCommentDownloader(
|
| 126 |
|
| 127 |
video_id = extract_video_id(video_url)
|
| 128 |
if not video_id:
|
|
@@ -130,13 +126,11 @@ def process_single_video2(video_url, loaded_quality_model, loaded_sentiment_pipe
|
|
| 130 |
return None
|
| 131 |
|
| 132 |
try:
|
| 133 |
-
|
| 134 |
-
time.sleep(random.uniform(1, 3)) # تأخير بين 1 و 3 ثوانٍ
|
| 135 |
|
| 136 |
with yt_dlp.YoutubeDL(ydl_opts_video_info) as ydl:
|
| 137 |
info_dict = ydl.extract_info(video_url, download=False)
|
| 138 |
|
| 139 |
-
# --- تسجيل البيانات المستخلصة ---
|
| 140 |
logging.info(f"[فيديو: {video_url}] تم استخراج البيانات: {info_dict.keys()}")
|
| 141 |
|
| 142 |
if not info_dict or info_dict.get('is_live', False) or info_dict.get('age_limit', 0) > 0:
|
|
@@ -146,16 +140,13 @@ def process_single_video2(video_url, loaded_quality_model, loaded_sentiment_pipe
|
|
| 146 |
views = info_dict.get('view_count', 0)
|
| 147 |
likes = info_dict.get('like_count', 0)
|
| 148 |
|
| 149 |
-
# --- تسجيل المشاهدات والإعجابات ---
|
| 150 |
logging.info(f"[فيديو: {video_url}] المشاهدات: {views}, الإعجابات: {likes}")
|
| 151 |
|
| 152 |
upload_date = info_dict.get('upload_date', 'Unknown')
|
| 153 |
publish_year = int(upload_date[:4]) if upload_date != 'Unknown' else datetime.now().year
|
| 154 |
|
| 155 |
-
|
| 156 |
-
time.sleep(random.uniform(1, 3)) # تأخير بين 1 و 3 ثوانٍ
|
| 157 |
|
| 158 |
-
# --- جلب التعليقات ---
|
| 159 |
sampled_comments = []
|
| 160 |
try:
|
| 161 |
for comment in downloader.get_comments_from_url(video_url):
|
|
@@ -171,7 +162,6 @@ def process_single_video2(video_url, loaded_quality_model, loaded_sentiment_pipe
|
|
| 171 |
comment_view_ratio = len(sampled_comments) / views if views > 0 else 0.0
|
| 172 |
engagement_score = like_view_ratio + comment_view_ratio
|
| 173 |
|
| 174 |
-
# --- تحليل التعليقات ---
|
| 175 |
positive_comments = 0
|
| 176 |
negative_comments = 0
|
| 177 |
overall_sentiment = "لا توجد تعليقات كافية"
|
|
@@ -234,7 +224,8 @@ async def evaluate_youtube_playlist_individually_same_method2(youtube_url: str,
|
|
| 234 |
|
| 235 |
video_links = []
|
| 236 |
try:
|
| 237 |
-
|
|
|
|
| 238 |
playlist_info = ydl.extract_info(youtube_url, download=False)
|
| 239 |
if 'entries' in playlist_info:
|
| 240 |
for entry in playlist_info['entries'][:10]:
|
|
@@ -249,7 +240,6 @@ async def evaluate_youtube_playlist_individually_same_method2(youtube_url: str,
|
|
| 249 |
|
| 250 |
individual_results = []
|
| 251 |
|
| 252 |
-
# --- معالجة الفيديوهات بالتوازي ---
|
| 253 |
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
| 254 |
futures = [
|
| 255 |
executor.submit(
|
|
@@ -265,10 +255,8 @@ async def evaluate_youtube_playlist_individually_same_method2(youtube_url: str,
|
|
| 265 |
result = future.result()
|
| 266 |
if result:
|
| 267 |
individual_results.append(result)
|
| 268 |
-
# تأخير إضافي بين معالجة نتائج الفيديوهات
|
| 269 |
time.sleep(random.uniform(0.5, 1.5))
|
| 270 |
|
| 271 |
-
# --- الإحصاء النهائي ---
|
| 272 |
num_good_videos = sum(1 for r in individual_results if r and r.get('quality') == 'جيد')
|
| 273 |
total_positive_comments = sum(r.get('positive_comments', 0) for r in individual_results if r)
|
| 274 |
total_negative_comments = sum(r.get('negative_comments', 0) for r in individual_results if r)
|
|
@@ -277,7 +265,6 @@ async def evaluate_youtube_playlist_individually_same_method2(youtube_url: str,
|
|
| 277 |
total_videos = len(individual_results)
|
| 278 |
percent_good_videos = (num_good_videos / total_videos) * 100 if total_videos > 0 else 0
|
| 279 |
|
| 280 |
-
# --- حساب التقييم العام ---
|
| 281 |
if percent_good_videos >= 70:
|
| 282 |
overall_quality = "جيد جداً"
|
| 283 |
elif percent_good_videos >= 50:
|
|
@@ -285,7 +272,6 @@ async def evaluate_youtube_playlist_individually_same_method2(youtube_url: str,
|
|
| 285 |
else:
|
| 286 |
overall_quality = "سيء"
|
| 287 |
|
| 288 |
-
# --- النظام المركب ---
|
| 289 |
WEIGHT_QUALITY = 0.6
|
| 290 |
WEIGHT_SENTIMENT = 0.4
|
| 291 |
|
|
|
|
| 19 |
import io
|
| 20 |
import base64
|
| 21 |
import math
|
| 22 |
+
import time
|
| 23 |
|
| 24 |
# --- إعدادات NLTK و Logging ---
|
|
|
|
| 25 |
nltk.data.path.append('/app/nltk_data')
|
| 26 |
try:
|
|
|
|
| 27 |
nltk.download('stopwords', quiet=True)
|
| 28 |
except Exception as e:
|
| 29 |
logging.error(f"Failed to download NLTK stopwords: {e}")
|
|
|
|
| 30 |
|
| 31 |
arabic_stopwords = set(stopwords.words('arabic'))
|
| 32 |
|
|
|
|
| 61 |
except Exception as e:
|
| 62 |
logging.error(f"خطأ غير متوقع أثناء تحميل نموذج تحليل المشاعر: {e}")
|
| 63 |
|
| 64 |
+
# --- إعدادات User-Agent لـ yt-dlp ---
|
| 65 |
ua = UserAgent()
|
| 66 |
|
| 67 |
def get_desktop_user_agent():
|
|
|
|
| 75 |
headers = {'User-Agent': selected_user_agent}
|
| 76 |
|
| 77 |
# تحديد مسار ملف الكوكيز داخل حاوية Docker
|
| 78 |
+
COOKIES_FILE_PATH = 'cookies.txt'
|
|
|
|
| 79 |
|
| 80 |
ydl_opts_video_info = {
|
| 81 |
'quiet': True,
|
|
|
|
| 86 |
'age_limit': 18,
|
| 87 |
'force_generic_extractor': False,
|
| 88 |
'http_headers': headers,
|
| 89 |
+
'cookiefile': COOKIES_FILE_PATH # سيبقى هذا الخيار لـ yt-dlp
|
| 90 |
}
|
| 91 |
|
| 92 |
# --- النموذج ---
|
|
|
|
| 117 |
|
| 118 |
# --- معالجة فيديو واحد فقط (نسخة 2) ---
|
| 119 |
def process_single_video2(video_url, loaded_quality_model, loaded_sentiment_pipeline, max_comments_per_video=50):
|
| 120 |
+
# تم حذف cookies_file من هنا
|
| 121 |
+
downloader = YoutubeCommentDownloader()
|
| 122 |
|
| 123 |
video_id = extract_video_id(video_url)
|
| 124 |
if not video_id:
|
|
|
|
| 126 |
return None
|
| 127 |
|
| 128 |
try:
|
| 129 |
+
time.sleep(random.uniform(1, 3))
|
|
|
|
| 130 |
|
| 131 |
with yt_dlp.YoutubeDL(ydl_opts_video_info) as ydl:
|
| 132 |
info_dict = ydl.extract_info(video_url, download=False)
|
| 133 |
|
|
|
|
| 134 |
logging.info(f"[فيديو: {video_url}] تم استخراج البيانات: {info_dict.keys()}")
|
| 135 |
|
| 136 |
if not info_dict or info_dict.get('is_live', False) or info_dict.get('age_limit', 0) > 0:
|
|
|
|
| 140 |
views = info_dict.get('view_count', 0)
|
| 141 |
likes = info_dict.get('like_count', 0)
|
| 142 |
|
|
|
|
| 143 |
logging.info(f"[فيديو: {video_url}] المشاهدات: {views}, الإعجابات: {likes}")
|
| 144 |
|
| 145 |
upload_date = info_dict.get('upload_date', 'Unknown')
|
| 146 |
publish_year = int(upload_date[:4]) if upload_date != 'Unknown' else datetime.now().year
|
| 147 |
|
| 148 |
+
time.sleep(random.uniform(1, 3))
|
|
|
|
| 149 |
|
|
|
|
| 150 |
sampled_comments = []
|
| 151 |
try:
|
| 152 |
for comment in downloader.get_comments_from_url(video_url):
|
|
|
|
| 162 |
comment_view_ratio = len(sampled_comments) / views if views > 0 else 0.0
|
| 163 |
engagement_score = like_view_ratio + comment_view_ratio
|
| 164 |
|
|
|
|
| 165 |
positive_comments = 0
|
| 166 |
negative_comments = 0
|
| 167 |
overall_sentiment = "لا توجد تعليقات كافية"
|
|
|
|
| 224 |
|
| 225 |
video_links = []
|
| 226 |
try:
|
| 227 |
+
# خيار cookiefile لـ yt-dlp يبقى هنا
|
| 228 |
+
with yt_dlp.YoutubeDL({'extract_flat': True, 'quiet': True, 'playlist_items': '1:10', 'cookiefile': COOKIES_FILE_PATH}) as ydl:
|
| 229 |
playlist_info = ydl.extract_info(youtube_url, download=False)
|
| 230 |
if 'entries' in playlist_info:
|
| 231 |
for entry in playlist_info['entries'][:10]:
|
|
|
|
| 240 |
|
| 241 |
individual_results = []
|
| 242 |
|
|
|
|
| 243 |
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
| 244 |
futures = [
|
| 245 |
executor.submit(
|
|
|
|
| 255 |
result = future.result()
|
| 256 |
if result:
|
| 257 |
individual_results.append(result)
|
|
|
|
| 258 |
time.sleep(random.uniform(0.5, 1.5))
|
| 259 |
|
|
|
|
| 260 |
num_good_videos = sum(1 for r in individual_results if r and r.get('quality') == 'جيد')
|
| 261 |
total_positive_comments = sum(r.get('positive_comments', 0) for r in individual_results if r)
|
| 262 |
total_negative_comments = sum(r.get('negative_comments', 0) for r in individual_results if r)
|
|
|
|
| 265 |
total_videos = len(individual_results)
|
| 266 |
percent_good_videos = (num_good_videos / total_videos) * 100 if total_videos > 0 else 0
|
| 267 |
|
|
|
|
| 268 |
if percent_good_videos >= 70:
|
| 269 |
overall_quality = "جيد جداً"
|
| 270 |
elif percent_good_videos >= 50:
|
|
|
|
| 272 |
else:
|
| 273 |
overall_quality = "سيء"
|
| 274 |
|
|
|
|
| 275 |
WEIGHT_QUALITY = 0.6
|
| 276 |
WEIGHT_SENTIMENT = 0.4
|
| 277 |
|