parthnuwal7 committed on
Commit
2b4639d
·
1 Parent(s): ac8959c

Integrated PyABSA again

Browse files
requirements-docker.txt CHANGED
@@ -18,13 +18,12 @@ numpy>=1.24.0,<1.26.0
18
  scikit-learn>=1.3.0,<1.4.0
19
  langdetect>=1.0.9
20
 
21
- # ===== Optional: Full ML Stack (Uncomment if using local models) =====
22
- # torch>=2.0.0,<2.2.0
23
- # transformers>=4.30.0,<4.37.0
24
- # pyabsa>=2.4.0,<3.0.0
25
- # sentencepiece>=0.1.99
26
- # sacremoses>=0.0.53
27
- # faiss-cpu>=1.7.4
28
 
29
  # ===== Streamlit (for dual interface) =====
30
  streamlit>=1.28.0,<1.30.0
 
18
  scikit-learn>=1.3.0,<1.4.0
19
  langdetect>=1.0.9
20
 
21
+ # ===== ML Stack for PyABSA =====
22
+ torch>=2.0.0,<2.2.0
23
+ transformers>=4.30.0,<4.37.0
24
+ pyabsa>=2.4.0,<3.0.0
25
+ sentencepiece>=0.1.99
26
+ sacremoses>=0.0.53
 
27
 
28
  # ===== Streamlit (for dual interface) =====
29
  streamlit>=1.28.0,<1.30.0
src/utils/data_processor.py CHANGED
@@ -189,14 +189,28 @@ class TranslationService:
189
 
190
 
191
  class ABSAProcessor:
192
- """Enhanced ABSA using Hugging Face Inference API - much more reliable for production."""
193
 
194
  def __init__(self):
195
- self.api_token = self._get_hf_token()
196
- self.sentiment_model = "cardiffnlp/twitter-roberta-base-sentiment-latest"
197
- self.base_url = "https://api-inference.huggingface.co/models"
198
- self.task_manager = None # Will be set by DataProcessor
199
- logger.info("Initialized HF Inference API for ABSA processing")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
 
201
  def set_task_manager(self, task_manager):
202
  """Set task manager for cancellation support."""
@@ -249,10 +263,10 @@ class ABSAProcessor:
249
 
250
  def extract_aspects_and_sentiments(self, reviews: List[str], task_id: Optional[str] = None) -> List[Dict[str, Any]]:
251
  """
252
- Extract aspects and sentiments using HF Inference API and rule-based aspects with cancellation support.
253
 
254
  Args:
255
- reviews: List of review texts
256
  task_id: Optional task ID for cancellation tracking
257
 
258
  Returns:
@@ -260,7 +274,7 @@ class ABSAProcessor:
260
  """
261
  import gc
262
 
263
- logger.info(f"Processing {len(reviews)} reviews with HF Inference API")
264
 
265
  processed_results = []
266
  batch_size = 5 # Process 5 reviews at a time for responsive cancellation
@@ -283,22 +297,17 @@ class ABSAProcessor:
283
  if review_idx % 10 == 0: # Progress logging
284
  logger.info(f"Processing review {review_idx+1}/{len(reviews)}")
285
 
286
- # Get sentiment from HF API
287
- sentiment = self._get_hf_sentiment(review)
288
-
289
- # Extract aspects using rule-based approach
290
- aspects = self._extract_simple_aspects(review)
 
 
 
 
291
 
292
- processed_result = {
293
- 'sentence': review,
294
- 'aspects': aspects,
295
- 'sentiments': [sentiment] * len(aspects),
296
- 'positions': [[0, len(review)]] * len(aspects),
297
- 'confidence_scores': [0.8] * len(aspects), # HF models are quite confident
298
- 'tokens': review.split(),
299
- 'iob_tags': ['O'] * len(review.split())
300
- }
301
- processed_results.append(processed_result)
302
 
303
  # Update progress after each batch (50-90% range)
304
  if task_id and self.task_manager:
@@ -308,6 +317,52 @@ class ABSAProcessor:
308
  logger.info(f"Successfully processed {len(processed_results)} reviews")
309
  return processed_results
310
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
  def _get_hf_sentiment(self, text: str) -> str:
312
  """Get sentiment from HF Inference API with fallback."""
313
  if not self.api_token:
@@ -344,21 +399,33 @@ class ABSAProcessor:
344
  return self._get_rule_based_sentiment(text)
345
 
346
  def _get_rule_based_sentiment(self, review: str) -> str:
347
- """Fallback rule-based sentiment analysis."""
348
  review_lower = review.lower()
349
 
350
  # Enhanced sentiment words
351
  positive_words = ['good', 'great', 'excellent', 'amazing', 'love', 'best', 'awesome',
352
  'fantastic', 'wonderful', 'perfect', 'satisfied', 'happy', 'pleased',
353
- 'outstanding', 'brilliant', 'superb', 'delighted', 'impressed']
 
354
 
355
  negative_words = ['bad', 'terrible', 'awful', 'hate', 'worst', 'horrible', 'poor',
356
  'disappointing', 'frustrated', 'angry', 'broken', 'failed', 'useless',
357
- 'pathetic', 'disgusting', 'annoying', 'waste', 'regret']
 
 
 
 
 
 
358
 
359
  pos_count = sum(1 for word in positive_words if word in review_lower)
360
  neg_count = sum(1 for word in negative_words if word in review_lower)
361
 
 
 
 
 
 
362
  if pos_count > neg_count:
363
  return 'Positive'
364
  elif neg_count > pos_count:
@@ -371,18 +438,22 @@ class ABSAProcessor:
371
  review_lower = review.lower()
372
  aspects = []
373
 
374
- # Enhanced aspect keywords
375
  aspect_keywords = {
376
- 'Quality': ['quality', 'build', 'material', 'construction', 'durability', 'solid', 'sturdy', 'cheap', 'flimsy'],
377
- 'Price': ['price', 'cost', 'expensive', 'cheap', 'value', 'money', 'affordable', 'budget', 'worth'],
378
- 'Service': ['service', 'support', 'help', 'staff', 'customer', 'response', 'team', 'representative'],
379
- 'Delivery': ['delivery', 'shipping', 'fast', 'quick', 'slow', 'delayed', 'arrive', 'package'],
380
- 'Design': ['design', 'look', 'appearance', 'beautiful', 'ugly', 'style', 'color', 'aesthetic'],
381
- 'Performance': ['performance', 'speed', 'fast', 'slow', 'efficiency', 'works', 'function', 'smooth'],
382
- 'Usability': ['easy', 'difficult', 'user', 'interface', 'intuitive', 'complex', 'simple', 'confusing'],
383
- 'Features': ['feature', 'function', 'capability', 'option', 'setting', 'mode', 'tool'],
384
- 'Size': ['size', 'big', 'small', 'large', 'compact', 'tiny', 'huge', 'dimension'],
385
- 'Battery': ['battery', 'charge', 'power', 'energy', 'last', 'drain', 'life']
 
 
 
 
386
  }
387
 
388
  for aspect, keywords in aspect_keywords.items():
 
189
 
190
 
191
  class ABSAProcessor:
192
+ """Enhanced ABSA using PyABSA for accurate aspect extraction and sentiment analysis."""
193
 
194
  def __init__(self):
195
+ self.model = None
196
+ self.task_manager = None
197
+ self._load_pyabsa_model()
198
+ logger.info("Initialized PyABSA for ABSA processing")
199
+
200
def _load_pyabsa_model(self):
    """Try to load the multilingual PyABSA aspect extractor into ``self.model``.

    Any failure, including PyABSA not being importable, is logged as a warning
    and leaves ``self.model`` set to ``None``.
    """
    try:
        # Imported here so an unavailable PyABSA install only disables the model.
        import pyabsa
        from pyabsa import ATEPCCheckpointManager

        # Multilingual checkpoint (chosen by the author for English and some Hindi).
        multilingual_ckpt = ATEPCCheckpointManager.get_checkpoint('multilingual')
        self.model = pyabsa.load_aspect_extractor(checkpoint=multilingual_ckpt)
        logger.info("PyABSA model loaded successfully")
    except Exception as load_error:
        logger.warning(f"Failed to load PyABSA model: {load_error!s}. Using fallback.")
        self.model = None
214
 
215
  def set_task_manager(self, task_manager):
216
  """Set task manager for cancellation support."""
 
263
 
264
  def extract_aspects_and_sentiments(self, reviews: List[str], task_id: Optional[str] = None) -> List[Dict[str, Any]]:
265
  """
266
+ Extract aspects and sentiments using PyABSA with fallback and cancellation support.
267
 
268
  Args:
269
+ reviews: List of review texts (preferably in English after translation)
270
  task_id: Optional task ID for cancellation tracking
271
 
272
  Returns:
 
274
  """
275
  import gc
276
 
277
+ logger.info(f"Processing {len(reviews)} reviews with PyABSA")
278
 
279
  processed_results = []
280
  batch_size = 5 # Process 5 reviews at a time for responsive cancellation
 
297
  if review_idx % 10 == 0: # Progress logging
298
  logger.info(f"Processing review {review_idx+1}/{len(reviews)}")
299
 
300
+ # Try PyABSA first, fallback to rule-based if unavailable
301
+ if self.model is not None:
302
+ try:
303
+ result = self._extract_with_pyabsa(review)
304
+ except Exception as e:
305
+ logger.warning(f"PyABSA failed for review {review_idx}: {str(e)}, using fallback")
306
+ result = self._extract_with_fallback(review)
307
+ else:
308
+ result = self._extract_with_fallback(review)
309
 
310
+ processed_results.append(result)
 
 
 
 
 
 
 
 
 
311
 
312
  # Update progress after each batch (50-90% range)
313
  if task_id and self.task_manager:
 
317
  logger.info(f"Successfully processed {len(processed_results)} reviews")
318
  return processed_results
319
 
320
+ def _extract_with_pyabsa(self, review: str) -> Dict[str, Any]:
321
+ """Extract aspects and sentiments using PyABSA model."""
322
+ result = self.model.predict(review, print_result=False, save_result=False)
323
+
324
+ # PyABSA returns: aspect, sentiment, confidence, position
325
+ aspects = result.get('aspect', [])
326
+ sentiments = result.get('sentiment', [])
327
+ positions = result.get('position', [])
328
+ confidence_scores = result.get('confidence', [])
329
+
330
+ # Handle single aspect case
331
+ if not isinstance(aspects, list):
332
+ aspects = [aspects] if aspects else []
333
+ sentiments = [sentiments] if sentiments else []
334
+ positions = [positions] if positions else []
335
+ confidence_scores = [confidence_scores] if confidence_scores else []
336
+
337
+ # If no aspects found, use fallback
338
+ if not aspects:
339
+ return self._extract_with_fallback(review)
340
+
341
+ return {
342
+ 'sentence': review,
343
+ 'aspects': aspects,
344
+ 'sentiments': sentiments,
345
+ 'positions': positions,
346
+ 'confidence_scores': confidence_scores,
347
+ 'tokens': review.split(),
348
+ 'iob_tags': ['O'] * len(review.split())
349
+ }
350
+
351
+ def _extract_with_fallback(self, review: str) -> Dict[str, Any]:
352
+ """Fallback rule-based extraction when PyABSA is unavailable."""
353
+ sentiment = self._get_rule_based_sentiment(review)
354
+ aspects = self._extract_simple_aspects(review)
355
+
356
+ return {
357
+ 'sentence': review,
358
+ 'aspects': aspects,
359
+ 'sentiments': [sentiment] * len(aspects),
360
+ 'positions': [[0, len(review)]] * len(aspects),
361
+ 'confidence_scores': [0.7] * len(aspects), # Lower confidence for rule-based
362
+ 'tokens': review.split(),
363
+ 'iob_tags': ['O'] * len(review.split())
364
+ }
365
+
366
  def _get_hf_sentiment(self, text: str) -> str:
367
  """Get sentiment from HF Inference API with fallback."""
368
  if not self.api_token:
 
399
  return self._get_rule_based_sentiment(text)
400
 
401
  def _get_rule_based_sentiment(self, review: str) -> str:
402
+ """Fallback rule-based sentiment analysis with enhanced negative detection."""
403
  review_lower = review.lower()
404
 
405
  # Enhanced sentiment words
406
  positive_words = ['good', 'great', 'excellent', 'amazing', 'love', 'best', 'awesome',
407
  'fantastic', 'wonderful', 'perfect', 'satisfied', 'happy', 'pleased',
408
+ 'outstanding', 'brilliant', 'superb', 'delighted', 'impressed', 'working',
409
+ 'अच्छा', 'बढ़िया', 'शानदार', 'बेहतरीन']
410
 
411
  negative_words = ['bad', 'terrible', 'awful', 'hate', 'worst', 'horrible', 'poor',
412
  'disappointing', 'frustrated', 'angry', 'broken', 'failed', 'useless',
413
+ 'pathetic', 'disgusting', 'annoying', 'waste', 'regret', 'problem', 'issue',
414
+ 'not working', 'doesn\'t work', 'never', 'delayed', 'late', 'slow', 'error',
415
+ 'खराब', 'समस्या', 'देर', 'नहीं', 'बुरा']
416
+
417
+ # Strong negative phrases (count as 2 points)
418
+ negative_phrases = ['too late', 'never comes', 'not received', 'doesn\'t arrive',
419
+ 'delayed', 'not working', 'बहुत देर', 'नहीं आता']
420
 
421
  pos_count = sum(1 for word in positive_words if word in review_lower)
422
  neg_count = sum(1 for word in negative_words if word in review_lower)
423
 
424
+ # Check for negative phrases (stronger signal)
425
+ for phrase in negative_phrases:
426
+ if phrase in review_lower:
427
+ neg_count += 2
428
+
429
  if pos_count > neg_count:
430
  return 'Positive'
431
  elif neg_count > pos_count:
 
438
  review_lower = review.lower()
439
  aspects = []
440
 
441
+ # Enhanced aspect keywords with Hindi/English variants
442
  aspect_keywords = {
443
+ 'OTP/Verification': ['otp', 'atp', 'verification', 'verify', 'code', 'pin', 'authentication', 'ओटीपी', 'कोड', 'सत्यापन'],
444
+ 'Login/Account': ['login', 'sign in', 'signin', 'account', 'password', 'username', 'register', 'signup', 'लॉगिन', 'खाता'],
445
+ 'App Performance': ['app', 'application', 'crash', 'freeze', 'hang', 'loading', 'lag', 'slow', 'एप', 'एप्लिकेशन'],
446
+ 'Payment': ['payment', 'pay', 'transaction', 'refund', 'money', 'bank', 'upi', 'wallet', 'भुगतान', 'पैसा'],
447
+ 'Quality': ['quality', 'build', 'material', 'construction', 'durability', 'solid', 'sturdy', 'cheap', 'flimsy', 'गुणवत्ता'],
448
+ 'Price': ['price', 'cost', 'expensive', 'cheap', 'value', 'money', 'affordable', 'budget', 'worth', 'कीमत', 'दाम'],
449
+ 'Service': ['service', 'support', 'help', 'staff', 'customer', 'response', 'team', 'representative', 'सेवा', 'सहायता'],
450
+ 'Delivery': ['delivery', 'shipping', 'fast', 'quick', 'slow', 'delayed', 'arrive', 'package', 'डिलीवरी', 'शिपिंग'],
451
+ 'Design': ['design', 'look', 'appearance', 'beautiful', 'ugly', 'style', 'color', 'aesthetic', 'डिज़ाइन', 'रूप'],
452
+ 'Performance': ['performance', 'speed', 'fast', 'slow', 'efficiency', 'works', 'function', 'smooth', 'प्रदर्शन'],
453
+ 'Usability': ['easy', 'difficult', 'user', 'interface', 'intuitive', 'complex', 'simple', 'confusing', 'उपयोग'],
454
+ 'Features': ['feature', 'function', 'capability', 'option', 'setting', 'mode', 'tool', 'फीचर', 'सुविधा'],
455
+ 'Size': ['size', 'big', 'small', 'large', 'compact', 'tiny', 'huge', 'dimension', 'आकार'],
456
+ 'Battery': ['battery', 'charge', 'power', 'energy', 'last', 'drain', 'life', 'बैटरी', 'चार्ज']
457
  }
458
 
459
  for aspect, keywords in aspect_keywords.items():