Tarun Singh committed on
Commit
171a9b7
·
1 Parent(s): f608f91

fine tune code 2.2

Browse files
Files changed (2) hide show
  1. app.py +81 -103
  2. requirements.txt +1 -1
app.py CHANGED
@@ -17,13 +17,9 @@ app = Flask(__name__)
17
  api_keys = os.getenv("YOUTUBE_API_KEYS").split(',')
18
  os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
19
 
20
- global fine_tuned_tokenizer, interpreter, input_details, output_details, BATCH_SIZE
21
  tokenizer_path = 'model/saved_tokenizer'
22
  model_path = 'model/model_float16.tflite'
23
- # fine_tuned_tokenizer = None
24
- # interpreter = None
25
- # input_details = None
26
- # output_details = None
27
  fine_tuned_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
28
  interpreter = tflite.Interpreter(model_path=model_path)
29
  interpreter.allocate_tensors()
@@ -33,27 +29,9 @@ output_details = interpreter.get_output_details()
33
  BATCH_SIZE = 10
34
  SENTIMENT_LABELS = {0: 'Sadness', 1: 'Joy', 2: 'Love', 3: 'Annoyed', 4: 'Fear', 5: 'Surprise'}
35
 
36
- # def load_model_and_tokenizer():
37
- # global fine_tuned_tokenizer, interpreter, input_details, output_details, BATCH_SIZE
38
- # if fine_tuned_tokenizer is None or interpreter is None or input_details is None or output_details is None:
39
- # try:
40
- # fine_tuned_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
41
- # interpreter = tflite.Interpreter(model_path=model_path)
42
- # interpreter.allocate_tensors()
43
- # input_details = interpreter.get_input_details()
44
- # output_details = interpreter.get_output_details()
45
- # print("Model and tokenizer loaded successfully")
46
- # except Exception as e:
47
- # print(f"Error loading tokenizer or model: {e}")
48
-
49
-
50
-
51
  def build_youtube_client(api_key):
52
  return build('youtube', 'v3', developerKey=api_key)
53
 
54
- def rotate_api_key(current_index):
55
- return (current_index + 1) % len(api_keys)
56
-
57
  def get_comments(video_id, max_results=1000):
58
  comments = []
59
  current_index = 0
@@ -79,7 +57,7 @@ def get_comments(video_id, max_results=1000):
79
  if not next_page_token:
80
  break
81
  except Exception:
82
- current_index = rotate_api_key(current_index)
83
  youtube = build_youtube_client(api_keys[current_index])
84
  break
85
 
@@ -102,11 +80,8 @@ def trim_whitespace(s):
102
  return s.strip()
103
 
104
  def remove_emojis(text):
105
- text = emoji.replace_emoji(text, replace="")
106
- if text =='':
107
- return 'This is a empty comment'
108
- else :
109
- return text
110
 
111
  def detect_and_translate(comments: pd.DataFrame, required_count=10):
112
  translator = Translator()
@@ -120,6 +95,7 @@ def detect_and_translate(comments: pd.DataFrame, required_count=10):
120
  print(f"Rate limit hit. Retrying after {delay:.2f} seconds...")
121
  time.sleep(delay)
122
 
 
123
  for index, row in comments.iterrows():
124
  comment = row['Comment']
125
  try:
@@ -134,83 +110,81 @@ def detect_and_translate(comments: pd.DataFrame, required_count=10):
134
  print(f"Language detection error for comment at index {index}: {e}")
135
  other_language_comments.append('This is a neutral text')
136
 
137
- def prioritize_longer_comments(comments):
138
- return sorted(comments, key=lambda comment: len(comment.split()), reverse=True)
139
-
140
- english_comments = prioritize_longer_comments(english_comments)
141
-
142
- def prioritize_shorter_comments(comments):
143
- return sorted(comments, key=lambda comment: len(comment.split()))
144
-
145
- other_language_comments = prioritize_shorter_comments(other_language_comments)
146
- hindi_comments = prioritize_shorter_comments(hindi_comments)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
148
- while len(translated_comments) < required_count and english_comments:
149
- translated_comments.append(english_comments.pop(0))
150
 
151
- while len(translated_comments) < required_count and other_language_comments:
 
152
  try:
153
- comment = other_language_comments.pop(0)
154
- for attempt in range(3):
155
- try:
156
- translation = translator.translate(comment, dest='en')
157
- if translation and translation.text:
158
- translated_comments.append(translation.text)
159
- else:
160
- translated_comments.append('This is a neutral text')
161
- break
162
- except (httpx.RequestError, httpx.TimeoutException) as e:
163
- print(f"Translation retry {attempt + 1} failed: {e}")
164
- time.sleep(1)
165
- except Exception as e:
166
- handle_rate_limit(attempt)
167
- else:
168
- translated_comments.append('This is a neutral text')
169
  except Exception as e:
170
- print(f"Translation error for other language comment: {e}")
171
- translated_comments.append('This is a neutral text')
172
 
173
- while len(translated_comments) < required_count and hindi_comments:
174
- try:
175
- comment = hindi_comments.pop(0)
176
- for attempt in range(3):
177
- try:
178
- translation = translator.translate(comment, dest='en')
179
- if translation and translation.text:
180
- translated_comments.append(translation.text)
181
- else:
182
- translated_comments.append('This is a neutral text')
183
- break
184
- except (httpx.RequestError, httpx.TimeoutException) as e:
185
- print(f"Translation retry {attempt + 1} failed: {e}")
186
- time.sleep(1)
187
- except Exception as e:
188
- handle_rate_limit(attempt)
189
- else:
190
- translated_comments.append('This is a neutral text')
191
- except Exception as e:
192
- print(f"Translation error for Hindi comment: {e}")
193
- translated_comments.append('This is a neutral text')
194
-
195
- result_df = pd.DataFrame(translated_comments, columns=['Comment'])
196
- return result_df
197
 
198
  def tflite_predict_batch(text_batch):
 
199
  inputs = fine_tuned_tokenizer(text_batch, return_tensors="np", padding="max_length", truncation=True, max_length=55)
200
-
201
  input_ids = inputs["input_ids"].astype(np.int64)
202
  attention_mask = inputs["attention_mask"].astype(np.int64)
203
  token_type_ids = inputs["token_type_ids"].astype(np.int64)
204
 
205
  results = []
206
- for i in range(input_ids.shape[0]):
207
- interpreter.set_tensor(input_details[1]['index'], np.expand_dims(input_ids[i], axis=0))
208
- interpreter.set_tensor(input_details[0]['index'], np.expand_dims(attention_mask[i], axis=0))
209
- interpreter.set_tensor(input_details[2]['index'], np.expand_dims(token_type_ids[i], axis=0))
210
- interpreter.invoke()
 
 
 
 
 
211
 
212
- output = interpreter.get_tensor(output_details[0]['index'])
213
- results.append(np.argmax(output, axis=1)[0])
214
 
215
  return results
216
 
@@ -218,6 +192,7 @@ def predict_sentiment(dataframe):
218
  comments = dataframe['Comment'].tolist()
219
  predictions = []
220
 
 
221
  batches = [comments[i:i + BATCH_SIZE] for i in range(0, len(comments), BATCH_SIZE)]
222
  for batch in batches:
223
  predictions.extend(tflite_predict_batch(batch))
@@ -225,17 +200,22 @@ def predict_sentiment(dataframe):
225
  return predictions
226
 
227
  def text_pre_processing(translated_comment: pd.DataFrame) -> pd.DataFrame:
228
- translated_comment['Comment']=translated_comment['Comment'].str.lower()
229
- translated_comment['Comment']=translated_comment['Comment'].apply(remove_emojis)
230
  return translated_comment
231
 
232
- def get_sentiment(comments_df : pd.DataFrame,comment_count=10) -> tuple:
233
- sentiment_counts = {'Sadness': 0, 'Joy': 0, 'Love': 0, 'Annoyed': 0, 'Fear': 0, 'Surprise': 0}
234
- comments_by_sentiment = {'Sadness': [], 'Joy': [], 'Love': [], 'Annoyed': [], 'Fear': [], 'Surprise': []}
235
- translated_comment = detect_and_translate(comments_df,comment_count)
 
 
236
  pre_processed_comments = text_pre_processing(translated_comment)
237
- # load_model_and_tokenizer()
 
238
  sentiment_indices = predict_sentiment(pre_processed_comments)
 
 
239
  for index, row in pre_processed_comments.iterrows():
240
  sentiment_label = SENTIMENT_LABELS[sentiment_indices[index]]
241
  sentiment_counts[sentiment_label] += 1
@@ -250,9 +230,7 @@ def index():
250
  video_id = extract_youtube_video_id(video_url)
251
  comment_count = int(request.form.get('comment_count', 10))
252
  if video_id:
253
- comment_to_fetch = comment_count
254
- if comment_count <= 30:
255
- comment_to_fetch=comment_count*10
256
  comments_df = get_comments(video_id, max_results=comment_to_fetch)
257
  if not comments_df.empty:
258
  sentiment_counts, comments_by_sentiment = get_sentiment(comments_df, comment_count)
@@ -269,4 +247,4 @@ def index():
269
  return render_template('index.html', sentiment_counts={}, comments_by_sentiment={})
270
 
271
  if __name__ == "__main__":
272
- app.run(debug=True)
 
17
  api_keys = os.getenv("YOUTUBE_API_KEYS").split(',')
18
  os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
19
 
20
+ # Load tokenizer and TFLite interpreter once at the start
21
  tokenizer_path = 'model/saved_tokenizer'
22
  model_path = 'model/model_float16.tflite'
 
 
 
 
23
  fine_tuned_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
24
  interpreter = tflite.Interpreter(model_path=model_path)
25
  interpreter.allocate_tensors()
 
29
  BATCH_SIZE = 10
30
  SENTIMENT_LABELS = {0: 'Sadness', 1: 'Joy', 2: 'Love', 3: 'Annoyed', 4: 'Fear', 5: 'Surprise'}
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  def build_youtube_client(api_key):
33
  return build('youtube', 'v3', developerKey=api_key)
34
 
 
 
 
35
  def get_comments(video_id, max_results=1000):
36
  comments = []
37
  current_index = 0
 
57
  if not next_page_token:
58
  break
59
  except Exception:
60
+ current_index = (current_index + 1) % len(api_keys)
61
  youtube = build_youtube_client(api_keys[current_index])
62
  break
63
 
 
80
  return s.strip()
81
 
82
  def remove_emojis(text):
83
+ text = emoji.replace_emoji(text, replace="")
84
+ return 'This is an empty comment' if text == '' else text
 
 
 
85
 
86
  def detect_and_translate(comments: pd.DataFrame, required_count=10):
87
  translator = Translator()
 
95
  print(f"Rate limit hit. Retrying after {delay:.2f} seconds...")
96
  time.sleep(delay)
97
 
98
+ # Categorize comments by language
99
  for index, row in comments.iterrows():
100
  comment = row['Comment']
101
  try:
 
110
  print(f"Language detection error for comment at index {index}: {e}")
111
  other_language_comments.append('This is a neutral text')
112
 
113
+ # Prioritize comments based on the specified order
114
+ english_long_comments = sorted([c for c in english_comments if len(c.split()) > 5], key=lambda x: len(x.split()), reverse=True)
115
+ english_short_comments = sorted([c for c in english_comments if len(c.split()) <= 5], key=lambda x: len(x.split()), reverse=True)
116
+ other_short_comments = sorted([c for c in other_language_comments if len(c.split()) <= 5], key=lambda x: len(x.split()))
117
+ other_long_comments = sorted([c for c in other_language_comments if len(c.split()) > 5], key=lambda x: len(x.split()))
118
+ hindi_short_comments = sorted([c for c in hindi_comments if len(c.split()) <= 5], key=lambda x: len(x.split()))
119
+ hindi_long_comments = sorted([c for c in hindi_comments if len(c.split()) > 5], key=lambda x: len(x.split()))
120
+
121
+ # Fill translated_comments based on the priority order
122
+ prioritized_comments = (
123
+ english_long_comments +
124
+ english_short_comments +
125
+ other_short_comments +
126
+ other_long_comments +
127
+ hindi_short_comments +
128
+ hindi_long_comments
129
+ )
130
+
131
+ # Add high-priority English comments directly without translation
132
+ while len(translated_comments) < required_count and prioritized_comments:
133
+ next_comment = prioritized_comments.pop(0)
134
+ if next_comment in english_comments:
135
+ translated_comments.append(next_comment)
136
+ else:
137
+ break # Break to start translating other languages
138
 
139
+ # Collect remaining comments for translation, up to the required count
140
+ comments_to_translate = prioritized_comments[:max(0, required_count - len(translated_comments))]
141
 
142
+ # Translate the selected comments one at a time (up to 3 attempts each)
143
+ if comments_to_translate:
144
  try:
145
+ translations = []
146
+ for comment in comments_to_translate:
147
+ for attempt in range(3):
148
+ try:
149
+ translation = translator.translate(comment, dest='en')
150
+ translations.append(translation.text if translation and translation.text else 'This is a neutral text')
151
+ break
152
+ except (httpx.RequestError, httpx.TimeoutException) as e:
153
+ print(f"Translation retry {attempt + 1} failed: {e}")
154
+ time.sleep(1)
155
+ except Exception as e:
156
+ handle_rate_limit(attempt)
157
+ else:
158
+ translations.append('This is a neutral text') # Fallback if all retries fail
159
+ translated_comments.extend(translations)
 
160
  except Exception as e:
161
+ print(f"Batch translation error: {e}")
162
+ translated_comments.extend(['This is a neutral text'] * len(comments_to_translate))
163
 
164
+ return pd.DataFrame(translated_comments[:required_count], columns=['Comment'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
 
166
  def tflite_predict_batch(text_batch):
167
+ # Tokenize the entire batch at once
168
  inputs = fine_tuned_tokenizer(text_batch, return_tensors="np", padding="max_length", truncation=True, max_length=55)
169
+
170
  input_ids = inputs["input_ids"].astype(np.int64)
171
  attention_mask = inputs["attention_mask"].astype(np.int64)
172
  token_type_ids = inputs["token_type_ids"].astype(np.int64)
173
 
174
  results = []
175
+ # Resize the input tensors to this batch's shape and reallocate before inference
176
+ interpreter.resize_tensor_input(input_details[1]['index'], input_ids.shape)
177
+ interpreter.resize_tensor_input(input_details[0]['index'], attention_mask.shape)
178
+ interpreter.resize_tensor_input(input_details[2]['index'], token_type_ids.shape)
179
+ interpreter.allocate_tensors()
180
+
181
+ interpreter.set_tensor(input_details[1]['index'], input_ids)
182
+ interpreter.set_tensor(input_details[0]['index'], attention_mask)
183
+ interpreter.set_tensor(input_details[2]['index'], token_type_ids)
184
+ interpreter.invoke()
185
 
186
+ output = interpreter.get_tensor(output_details[0]['index'])
187
+ results.extend(np.argmax(output, axis=1))
188
 
189
  return results
190
 
 
192
  comments = dataframe['Comment'].tolist()
193
  predictions = []
194
 
195
+ # Process comments in batches
196
  batches = [comments[i:i + BATCH_SIZE] for i in range(0, len(comments), BATCH_SIZE)]
197
  for batch in batches:
198
  predictions.extend(tflite_predict_batch(batch))
 
200
  return predictions
201
 
202
  def text_pre_processing(translated_comment: pd.DataFrame) -> pd.DataFrame:
203
+ # Only remove emojis, avoid lowercasing
204
+ translated_comment['Comment'] = translated_comment['Comment'].apply(remove_emojis)
205
  return translated_comment
206
 
207
+ def get_sentiment(comments_df: pd.DataFrame, comment_count=10) -> tuple:
208
+ sentiment_counts = {label: 0 for label in SENTIMENT_LABELS.values()}
209
+ comments_by_sentiment = {label: [] for label in SENTIMENT_LABELS.values()}
210
+
211
+ # Detect language, translate comments, and preprocess text
212
+ translated_comment = detect_and_translate(comments_df, comment_count)
213
  pre_processed_comments = text_pre_processing(translated_comment)
214
+
215
+ # Predict sentiments for the pre-processed comments
216
  sentiment_indices = predict_sentiment(pre_processed_comments)
217
+
218
+ # Organize results by sentiment
219
  for index, row in pre_processed_comments.iterrows():
220
  sentiment_label = SENTIMENT_LABELS[sentiment_indices[index]]
221
  sentiment_counts[sentiment_label] += 1
 
230
  video_id = extract_youtube_video_id(video_url)
231
  comment_count = int(request.form.get('comment_count', 10))
232
  if video_id:
233
+ comment_to_fetch = comment_count * 10 if comment_count <= 30 else comment_count
 
 
234
  comments_df = get_comments(video_id, max_results=comment_to_fetch)
235
  if not comments_df.empty:
236
  sentiment_counts, comments_by_sentiment = get_sentiment(comments_df, comment_count)
 
247
  return render_template('index.html', sentiment_counts={}, comments_by_sentiment={})
248
 
249
  if __name__ == "__main__":
250
+ app.run(debug=True)
requirements.txt CHANGED
@@ -5,5 +5,5 @@ pandas
5
  Flask
6
  emoji
7
  gunicorn
8
- googletrans==4.0.0-rc1
9
  pycld3
 
5
  Flask
6
  emoji
7
  gunicorn
8
+ googletrans==3.1.0a0
9
  pycld3