Tarun Singh committed
Commit 171a9b7 · 1 Parent(s): f608f91
fine tune code 2.2

Files changed:
- app.py +81 -103
- requirements.txt +1 -1
app.py
CHANGED
@@ -17,13 +17,9 @@ app = Flask(__name__)
 api_keys = os.getenv("YOUTUBE_API_KEYS").split(',')
 os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
 
-
+# Load tokenizer and TFLite interpreter once at the start
 tokenizer_path = 'model/saved_tokenizer'
 model_path = 'model/model_float16.tflite'
-# fine_tuned_tokenizer = None
-# interpreter = None
-# input_details = None
-# output_details = None
 fine_tuned_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
 interpreter = tflite.Interpreter(model_path=model_path)
 interpreter.allocate_tensors()

@@ -33,27 +29,9 @@ output_details = interpreter.get_output_details()
 BATCH_SIZE = 10
 SENTIMENT_LABELS = {0: 'Sadness', 1: 'Joy', 2: 'Love', 3: 'Annoyed', 4: 'Fear', 5: 'Surprise'}
 
-# def load_model_and_tokenizer():
-#     global fine_tuned_tokenizer, interpreter, input_details, output_details, BATCH_SIZE
-#     if fine_tuned_tokenizer is None or interpreter is None or input_details is None or output_details is None:
-#         try:
-#             fine_tuned_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
-#             interpreter = tflite.Interpreter(model_path=model_path)
-#             interpreter.allocate_tensors()
-#             input_details = interpreter.get_input_details()
-#             output_details = interpreter.get_output_details()
-#             print("Model and tokenizer loaded successfully")
-#         except Exception as e:
-#             print(f"Error loading tokenizer or model: {e}")
-
-
-
 def build_youtube_client(api_key):
     return build('youtube', 'v3', developerKey=api_key)
 
-def rotate_api_key(current_index):
-    return (current_index + 1) % len(api_keys)
-
 def get_comments(video_id, max_results=1000):
     comments = []
     current_index = 0

@@ -79,7 +57,7 @@ def get_comments(video_id, max_results=1000):
             if not next_page_token:
                 break
         except Exception:
-            current_index = rotate_api_key(current_index)
+            current_index = (current_index + 1) % len(api_keys)
             youtube = build_youtube_client(api_keys[current_index])
             break

@@ -102,11 +80,8 @@ def trim_whitespace(s):
     return s.strip()
 
 def remove_emojis(text):
-    ...
-        return 'This is a empty comment'
-    else :
-        return text
+    text = emoji.replace_emoji(text, replace="")
+    return 'This is an empty comment' if text == '' else text
 
 def detect_and_translate(comments: pd.DataFrame, required_count=10):
     translator = Translator()

@@ -120,6 +95,7 @@ def detect_and_translate(comments: pd.DataFrame, required_count=10):
        print(f"Rate limit hit. Retrying after {delay:.2f} seconds...")
        time.sleep(delay)
 
+    # Categorize comments by language
     for index, row in comments.iterrows():
         comment = row['Comment']
         try:

@@ -134,83 +110,81 @@ def detect_and_translate(comments: pd.DataFrame, required_count=10):
            print(f"Language detection error for comment at index {index}: {e}")
            other_language_comments.append('This is a neutral text')
 
-    try:
-        for ...
-            translated_comments.append('This is a neutral text')
-    except Exception as e:
-        print(f"...")
-        translated_comments.append('This is a neutral text')
-
-    try:
-        comment = hindi_comments.pop(0)
-        for attempt in range(3):
-            try:
-                translation = translator.translate(comment, dest='en')
-                if translation and translation.text:
-                    translated_comments.append(translation.text)
-                else:
-                    translated_comments.append('This is a neutral text')
-                break
-            except (httpx.RequestError, httpx.TimeoutException) as e:
-                print(f"Translation retry {attempt + 1} failed: {e}")
-                time.sleep(1)
-            except Exception as e:
-                handle_rate_limit(attempt)
-        else:
-            translated_comments.append('This is a neutral text')
-    except Exception as e:
-        print(f"Translation error for Hindi comment: {e}")
-        translated_comments.append('This is a neutral text')
-
-    result_df = pd.DataFrame(translated_comments, columns=['Comment'])
-    return result_df
+    # Prioritize comments based on the specified order
+    english_long_comments = sorted([c for c in english_comments if len(c.split()) > 5], key=lambda x: len(x.split()), reverse=True)
+    english_short_comments = sorted([c for c in english_comments if len(c.split()) <= 5], key=lambda x: len(x.split()), reverse=True)
+    other_short_comments = sorted([c for c in other_language_comments if len(c.split()) <= 5], key=lambda x: len(x.split()))
+    other_long_comments = sorted([c for c in other_language_comments if len(c.split()) > 5], key=lambda x: len(x.split()))
+    hindi_short_comments = sorted([c for c in hindi_comments if len(c.split()) <= 5], key=lambda x: len(x.split()))
+    hindi_long_comments = sorted([c for c in hindi_comments if len(c.split()) > 5], key=lambda x: len(x.split()))
+
+    # Fill translated_comments based on the priority order
+    prioritized_comments = (
+        english_long_comments +
+        english_short_comments +
+        other_short_comments +
+        other_long_comments +
+        hindi_short_comments +
+        hindi_long_comments
+    )
+
+    # Add high-priority English comments directly without translation
+    while len(translated_comments) < required_count and prioritized_comments:
+        next_comment = prioritized_comments.pop(0)
+        if next_comment in english_comments:
+            translated_comments.append(next_comment)
+        else:
+            break  # Break to start translating other languages
+
+    # Collect remaining comments for translation, up to the required count
+    comments_to_translate = prioritized_comments[:max(0, required_count - len(translated_comments))]
+
+    # Batch translate the selected comments
+    if comments_to_translate:
+        try:
+            translations = []
+            for comment in comments_to_translate:
+                for attempt in range(3):
+                    try:
+                        translation = translator.translate(comment, dest='en')
+                        translations.append(translation.text if translation and translation.text else 'This is a neutral text')
+                        break
+                    except (httpx.RequestError, httpx.TimeoutException) as e:
+                        print(f"Translation retry {attempt + 1} failed: {e}")
+                        time.sleep(1)
+                    except Exception as e:
+                        handle_rate_limit(attempt)
+                else:
+                    translations.append('This is a neutral text')  # Fallback if all retries fail
+            translated_comments.extend(translations)
+        except Exception as e:
+            print(f"Batch translation error: {e}")
+            translated_comments.extend(['This is a neutral text'] * len(comments_to_translate))
+
+    return pd.DataFrame(translated_comments[:required_count], columns=['Comment'])
 
 def tflite_predict_batch(text_batch):
+    # Tokenize the entire batch at once
     inputs = fine_tuned_tokenizer(text_batch, return_tensors="np", padding="max_length", truncation=True, max_length=55)
 
     input_ids = inputs["input_ids"].astype(np.int64)
     attention_mask = inputs["attention_mask"].astype(np.int64)
     token_type_ids = inputs["token_type_ids"].astype(np.int64)
 
     results = []
+    # Reuse tensors in the batch inference process
+    interpreter.resize_tensor_input(input_details[1]['index'], input_ids.shape)
+    interpreter.resize_tensor_input(input_details[0]['index'], attention_mask.shape)
+    interpreter.resize_tensor_input(input_details[2]['index'], token_type_ids.shape)
+    interpreter.allocate_tensors()
+
+    interpreter.set_tensor(input_details[1]['index'], input_ids)
+    interpreter.set_tensor(input_details[0]['index'], attention_mask)
+    interpreter.set_tensor(input_details[2]['index'], token_type_ids)
+    interpreter.invoke()
 
+    output = interpreter.get_tensor(output_details[0]['index'])
+    results.extend(np.argmax(output, axis=1))
 
     return results

@@ -218,6 +192,7 @@ def predict_sentiment(dataframe):
     comments = dataframe['Comment'].tolist()
     predictions = []
 
+    # Process comments in batches
     batches = [comments[i:i + BATCH_SIZE] for i in range(0, len(comments), BATCH_SIZE)]
     for batch in batches:
         predictions.extend(tflite_predict_batch(batch))

@@ -225,17 +200,22 @@ def predict_sentiment(dataframe):
     return predictions
 
 def text_pre_processing(translated_comment: pd.DataFrame) -> pd.DataFrame:
-    ...
-    translated_comment['Comment']=translated_comment['Comment'].apply(remove_emojis)
+    # Only remove emojis, avoid lowercasing
+    translated_comment['Comment'] = translated_comment['Comment'].apply(remove_emojis)
     return translated_comment
 
-def get_sentiment(comments_df
-    sentiment_counts = {
-    comments_by_sentiment = {
+def get_sentiment(comments_df: pd.DataFrame, comment_count=10) -> tuple:
+    sentiment_counts = {label: 0 for label in SENTIMENT_LABELS.values()}
+    comments_by_sentiment = {label: [] for label in SENTIMENT_LABELS.values()}
+
+    # Detect language, translate comments, and preprocess text
+    translated_comment = detect_and_translate(comments_df, comment_count)
     pre_processed_comments = text_pre_processing(translated_comment)
+
+    # Predict sentiments for the pre-processed comments
     sentiment_indices = predict_sentiment(pre_processed_comments)
+
+    # Organize results by sentiment
     for index, row in pre_processed_comments.iterrows():
         sentiment_label = SENTIMENT_LABELS[sentiment_indices[index]]
         sentiment_counts[sentiment_label] += 1

@@ -250,9 +230,7 @@ def index():
         video_id = extract_youtube_video_id(video_url)
         comment_count = int(request.form.get('comment_count', 10))
         if video_id:
-            comment_to_fetch = comment_count
-            if comment_count <= 30:
-                comment_to_fetch=comment_count*10
+            comment_to_fetch = comment_count * 10 if comment_count <= 30 else comment_count
             comments_df = get_comments(video_id, max_results=comment_to_fetch)
             if not comments_df.empty:
                 sentiment_counts, comments_by_sentiment = get_sentiment(comments_df, comment_count)

@@ -269,4 +247,4 @@ def index():
     return render_template('index.html', sentiment_counts={}, comments_by_sentiment={})
 
 if __name__ == "__main__":
-    app.run(debug=True)
+    app.run(debug=True)
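Note on the rewritten remove_emojis: it now leans on emoji.replace_emoji from the already-listed emoji package, substituting a placeholder when a comment was emoji-only so downstream tokenization never receives an empty string. A quick check of the two paths:

import emoji

print(repr(emoji.replace_emoji('nice 🔥🔥', replace='')))  # 'nice ' (emoji stripped, text kept)
print(emoji.replace_emoji('🔥🔥', replace='') == '')       # True: the placeholder branch is taken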
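Note on the new selection logic in detect_and_translate: comments are bucketed by language and word count, then consumed in a fixed priority order (long English, short English, short then long other-language, short then long Hindi). A toy run of the same sort keys makes the ordering concrete (the sample comments are made up):

comments = ['great video', 'this tutorial finally made the attention idea click', 'nice']

english_long = sorted([c for c in comments if len(c.split()) > 5],
                      key=lambda c: len(c.split()), reverse=True)
english_short = sorted([c for c in comments if len(c.split()) <= 5],
                       key=lambda c: len(c.split()), reverse=True)

print(english_long + english_short)
# ['this tutorial finally made the attention idea click', 'great video', 'nice']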
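The retry loop inside the batch-translation block relies on Python's for ... else: the else arm runs only when the loop finishes without hitting break, i.e. when all three attempts failed, so the neutral placeholder is appended exactly once per failed comment. A stripped-down sketch of the same pattern, with a hypothetical translate_once standing in for the googletrans call:

import time

def translate_with_retries(comment, translate_once, retries=3):
    result = None
    for attempt in range(retries):
        try:
            translation = translate_once(comment)
            result = translation if translation else 'This is a neutral text'
            break  # success: the else-arm below is skipped
        except Exception as e:
            print(f"Translation retry {attempt + 1} failed: {e}")
            time.sleep(1)  # fixed one-second backoff, as in the commit
    else:
        result = 'This is a neutral text'  # loop never hit break: every attempt failed
    return result

With googletrans this would be invoked as translate_with_retries(c, lambda t: translator.translate(t, dest='en').text).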
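Note on tflite_predict_batch: it addresses the input tensors by hard-coded positions (input_details[1] for input_ids, [0] for attention_mask, [2] for token_type_ids). That ordering is a property of this particular conversion and can silently change if the model is re-exported. A defensive sketch that matches tensors by name instead, assuming the converted model keeps recognizable substrings such as input_ids in each detail['name'] (worth confirming once by printing input_details):

import numpy as np

def run_batch_by_name(interpreter, encoded):
    # `encoded` is the tokenizer output holding numpy arrays under the keys
    # 'input_ids', 'attention_mask' and 'token_type_ids'.
    input_details = interpreter.get_input_details()

    def key_for(detail):
        # Match a batch array to a tensor via a substring of its converted name;
        # raises StopIteration if the naming scheme differs (assumption to verify).
        return next(k for k in encoded if k in detail['name'])

    for detail in input_details:
        interpreter.resize_tensor_input(detail['index'], encoded[key_for(detail)].shape)
    interpreter.allocate_tensors()  # required again after any resize, before set_tensor
    for detail in input_details:
        interpreter.set_tensor(detail['index'], encoded[key_for(detail)].astype(np.int64))
    interpreter.invoke()
    output = interpreter.get_tensor(interpreter.get_output_details()[0]['index'])
    return np.argmax(output, axis=1)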
requirements.txt
CHANGED
@@ -5,5 +5,5 @@ pandas
 Flask
 emoji
 gunicorn
-googletrans==
+googletrans==3.1.0a0
 pycld3
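On the requirements.txt pin: 3.1.0a0 is a pre-release, and pip installs it only because the version is pinned exactly; a bare googletrans would resolve to the stable release, which is widely reported to fail against current Google Translate endpoints. A quick smoke test after installing, assuming network access (the Hindi sample is arbitrary):

from googletrans import Translator

translator = Translator()
result = translator.translate('नमस्ते दुनिया', dest='en')
print(result.src, '->', result.text)  # expect something like: hi -> Hello world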