Spaces:

parthnuwal7
/

ABSA

Sleeping

App Files Files Community

parthnuwal7 committed on Oct 19, 2025

Commit

2b4639d

1 Parent(s): ac8959c

Integrated PyABSA again

Browse files

Files changed (2) hide show

requirements-docker.txt +6 -7
src/utils/data_processor.py +109 -38

requirements-docker.txt CHANGED Viewed

@@ -18,13 +18,12 @@ numpy>=1.24.0,<1.26.0
 scikit-learn>=1.3.0,<1.4.0
 langdetect>=1.0.9
-# ===== Optional: Full ML Stack (Uncomment if using local models) =====
-# torch>=2.0.0,<2.2.0
-# transformers>=4.30.0,<4.37.0
-# pyabsa>=2.4.0,<3.0.0
-# sentencepiece>=0.1.99
-# sacremoses>=0.0.53
-# faiss-cpu>=1.7.4
 # ===== Streamlit (for dual interface) =====
 streamlit>=1.28.0,<1.30.0

 scikit-learn>=1.3.0,<1.4.0
 langdetect>=1.0.9
+# ===== ML Stack for PyABSA =====
+torch>=2.0.0,<2.2.0
+transformers>=4.30.0,<4.37.0
+pyabsa>=2.4.0,<3.0.0
+sentencepiece>=0.1.99
+sacremoses>=0.0.53
 # ===== Streamlit (for dual interface) =====
 streamlit>=1.28.0,<1.30.0

src/utils/data_processor.py CHANGED Viewed

@@ -189,14 +189,28 @@ class TranslationService:
 class ABSAProcessor:
-    """Enhanced ABSA using Hugging Face Inference API - much more reliable for production."""
     def __init__(self):
-        self.api_token = self._get_hf_token()
-        self.sentiment_model = "cardiffnlp/twitter-roberta-base-sentiment-latest"
-        self.base_url = "https://api-inference.huggingface.co/models"
-        self.task_manager = None  # Will be set by DataProcessor
-        logger.info("Initialized HF Inference API for ABSA processing")
     def set_task_manager(self, task_manager):
         """Set task manager for cancellation support."""
@@ -249,10 +263,10 @@ class ABSAProcessor:
     def extract_aspects_and_sentiments(self, reviews: List[str], task_id: Optional[str] = None) -> List[Dict[str, Any]]:
         """
-        Extract aspects and sentiments using HF Inference API and rule-based aspects with cancellation support.
         Args:
-            reviews: List of review texts
             task_id: Optional task ID for cancellation tracking
         Returns:
@@ -260,7 +274,7 @@ class ABSAProcessor:
         """
         import gc
-        logger.info(f"Processing {len(reviews)} reviews with HF Inference API")
         processed_results = []
         batch_size = 5  # Process 5 reviews at a time for responsive cancellation
@@ -283,22 +297,17 @@ class ABSAProcessor:
                 if review_idx % 10 == 0:  # Progress logging
                     logger.info(f"Processing review {review_idx+1}/{len(reviews)}")
-                # Get sentiment from HF API
-                sentiment = self._get_hf_sentiment(review)
-                # Extract aspects using rule-based approach
-                aspects = self._extract_simple_aspects(review)
-                processed_result = {
-                    'sentence': review,
-                    'aspects': aspects,
-                    'sentiments': [sentiment] * len(aspects),
-                    'positions': [[0, len(review)]] * len(aspects),
-                    'confidence_scores': [0.8] * len(aspects),  # HF models are quite confident
-                    'tokens': review.split(),
-                    'iob_tags': ['O'] * len(review.split())
-                }
-                processed_results.append(processed_result)
             # Update progress after each batch (50-90% range)
             if task_id and self.task_manager:
@@ -308,6 +317,52 @@ class ABSAProcessor:
         logger.info(f"Successfully processed {len(processed_results)} reviews")
         return processed_results
     def _get_hf_sentiment(self, text: str) -> str:
         """Get sentiment from HF Inference API with fallback."""
         if not self.api_token:
@@ -344,21 +399,33 @@ class ABSAProcessor:
             return self._get_rule_based_sentiment(text)
     def _get_rule_based_sentiment(self, review: str) -> str:
-        """Fallback rule-based sentiment analysis."""
         review_lower = review.lower()
         # Enhanced sentiment words
         positive_words = ['good', 'great', 'excellent', 'amazing', 'love', 'best', 'awesome',
                          'fantastic', 'wonderful', 'perfect', 'satisfied', 'happy', 'pleased',
-                         'outstanding', 'brilliant', 'superb', 'delighted', 'impressed']
         negative_words = ['bad', 'terrible', 'awful', 'hate', 'worst', 'horrible', 'poor',
                          'disappointing', 'frustrated', 'angry', 'broken', 'failed', 'useless',
-                         'pathetic', 'disgusting', 'annoying', 'waste', 'regret']
         pos_count = sum(1 for word in positive_words if word in review_lower)
         neg_count = sum(1 for word in negative_words if word in review_lower)
         if pos_count > neg_count:
             return 'Positive'
         elif neg_count > pos_count:
@@ -371,18 +438,22 @@ class ABSAProcessor:
         review_lower = review.lower()
         aspects = []
-        # Enhanced aspect keywords
         aspect_keywords = {
-            'Quality': ['quality', 'build', 'material', 'construction', 'durability', 'solid', 'sturdy', 'cheap', 'flimsy'],
-            'Price': ['price', 'cost', 'expensive', 'cheap', 'value', 'money', 'affordable', 'budget', 'worth'],
-            'Service': ['service', 'support', 'help', 'staff', 'customer', 'response', 'team', 'representative'],
-            'Delivery': ['delivery', 'shipping', 'fast', 'quick', 'slow', 'delayed', 'arrive', 'package'],
-            'Design': ['design', 'look', 'appearance', 'beautiful', 'ugly', 'style', 'color', 'aesthetic'],
-            'Performance': ['performance', 'speed', 'fast', 'slow', 'efficiency', 'works', 'function', 'smooth'],
-            'Usability': ['easy', 'difficult', 'user', 'interface', 'intuitive', 'complex', 'simple', 'confusing'],
-            'Features': ['feature', 'function', 'capability', 'option', 'setting', 'mode', 'tool'],
-            'Size': ['size', 'big', 'small', 'large', 'compact', 'tiny', 'huge', 'dimension'],
-            'Battery': ['battery', 'charge', 'power', 'energy', 'last', 'drain', 'life']
         }
         for aspect, keywords in aspect_keywords.items():

 class ABSAProcessor:
+    """Enhanced ABSA using PyABSA for accurate aspect extraction and sentiment analysis."""
     def __init__(self):
+        self.model = None
+        self.task_manager = None
+        self._load_pyabsa_model()
+        logger.info("Initialized PyABSA for ABSA processing")
+    def _load_pyabsa_model(self):
+        """Load PyABSA multilingual model with caching."""
+        try:
+            import pyabsa
+            from pyabsa import ATEPCCheckpointManager
+            # Use multilingual checkpoint - works for English and some Hindi
+            checkpoint = ATEPCCheckpointManager.get_checkpoint('multilingual')
+            self.model = pyabsa.load_aspect_extractor(checkpoint=checkpoint)
+            logger.info("PyABSA model loaded successfully")
+        except Exception as e:
+            logger.warning(f"Failed to load PyABSA model: {str(e)}. Using fallback.")
+            self.model = None
     def set_task_manager(self, task_manager):
         """Set task manager for cancellation support."""
     def extract_aspects_and_sentiments(self, reviews: List[str], task_id: Optional[str] = None) -> List[Dict[str, Any]]:
         """
+        Extract aspects and sentiments using PyABSA with fallback and cancellation support.
         Args:
+            reviews: List of review texts (preferably in English after translation)
             task_id: Optional task ID for cancellation tracking
         Returns:
         """
         import gc
+        logger.info(f"Processing {len(reviews)} reviews with PyABSA")
         processed_results = []
         batch_size = 5  # Process 5 reviews at a time for responsive cancellation
                 if review_idx % 10 == 0:  # Progress logging
                     logger.info(f"Processing review {review_idx+1}/{len(reviews)}")
+                # Try PyABSA first, fallback to rule-based if unavailable
+                if self.model is not None:
+                    try:
+                        result = self._extract_with_pyabsa(review)
+                    except Exception as e:
+                        logger.warning(f"PyABSA failed for review {review_idx}: {str(e)}, using fallback")
+                        result = self._extract_with_fallback(review)
+                else:
+                    result = self._extract_with_fallback(review)
+                processed_results.append(result)
             # Update progress after each batch (50-90% range)
             if task_id and self.task_manager:
         logger.info(f"Successfully processed {len(processed_results)} reviews")
         return processed_results
+    def _extract_with_pyabsa(self, review: str) -> Dict[str, Any]:
+        """Extract aspects and sentiments using PyABSA model."""
+        result = self.model.predict(review, print_result=False, save_result=False)
+        # PyABSA returns: aspect, sentiment, confidence, position
+        aspects = result.get('aspect', [])
+        sentiments = result.get('sentiment', [])
+        positions = result.get('position', [])
+        confidence_scores = result.get('confidence', [])
+        # Handle single aspect case
+        if not isinstance(aspects, list):
+            aspects = [aspects] if aspects else []
+            sentiments = [sentiments] if sentiments else []
+            positions = [positions] if positions else []
+            confidence_scores = [confidence_scores] if confidence_scores else []
+        # If no aspects found, use fallback
+        if not aspects:
+            return self._extract_with_fallback(review)
+        return {
+            'sentence': review,
+            'aspects': aspects,
+            'sentiments': sentiments,
+            'positions': positions,
+            'confidence_scores': confidence_scores,
+            'tokens': review.split(),
+            'iob_tags': ['O'] * len(review.split())
+        }
+    def _extract_with_fallback(self, review: str) -> Dict[str, Any]:
+        """Fallback rule-based extraction when PyABSA is unavailable."""
+        sentiment = self._get_rule_based_sentiment(review)
+        aspects = self._extract_simple_aspects(review)
+        return {
+            'sentence': review,
+            'aspects': aspects,
+            'sentiments': [sentiment] * len(aspects),
+            'positions': [[0, len(review)]] * len(aspects),
+            'confidence_scores': [0.7] * len(aspects),  # Lower confidence for rule-based
+            'tokens': review.split(),
+            'iob_tags': ['O'] * len(review.split())
+        }
     def _get_hf_sentiment(self, text: str) -> str:
         """Get sentiment from HF Inference API with fallback."""
         if not self.api_token:
             return self._get_rule_based_sentiment(text)
     def _get_rule_based_sentiment(self, review: str) -> str:
+        """Fallback rule-based sentiment analysis with enhanced negative detection."""
         review_lower = review.lower()
         # Enhanced sentiment words
         positive_words = ['good', 'great', 'excellent', 'amazing', 'love', 'best', 'awesome',
                          'fantastic', 'wonderful', 'perfect', 'satisfied', 'happy', 'pleased',
+                         'outstanding', 'brilliant', 'superb', 'delighted', 'impressed', 'working',
+                         'अच्छा', 'बढ़िया', 'शानदार', 'बेहतरीन']
         negative_words = ['bad', 'terrible', 'awful', 'hate', 'worst', 'horrible', 'poor',
                          'disappointing', 'frustrated', 'angry', 'broken', 'failed', 'useless',
+                         'pathetic', 'disgusting', 'annoying', 'waste', 'regret', 'problem', 'issue',
+                         'not working', 'doesn\'t work', 'never', 'delayed', 'late', 'slow', 'error',
+                         'खराब', 'समस्या', 'देर', 'नहीं', 'बुरा']
+        # Strong negative phrases (count as 2 points)
+        negative_phrases = ['too late', 'never comes', 'not received', 'doesn\'t arrive',
+                           'delayed', 'not working', 'बहुत देर', 'नहीं आता']
         pos_count = sum(1 for word in positive_words if word in review_lower)
         neg_count = sum(1 for word in negative_words if word in review_lower)
+        # Check for negative phrases (stronger signal)
+        for phrase in negative_phrases:
+            if phrase in review_lower:
+                neg_count += 2
         if pos_count > neg_count:
             return 'Positive'
         elif neg_count > pos_count:
         review_lower = review.lower()
         aspects = []
+        # Enhanced aspect keywords with Hindi/English variants
         aspect_keywords = {
+            'OTP/Verification': ['otp', 'atp', 'verification', 'verify', 'code', 'pin', 'authentication', 'ओटीपी', 'कोड', 'सत्यापन'],
+            'Login/Account': ['login', 'sign in', 'signin', 'account', 'password', 'username', 'register', 'signup', 'लॉगिन', 'खाता'],
+            'App Performance': ['app', 'application', 'crash', 'freeze', 'hang', 'loading', 'lag', 'slow', 'एप', 'एप्लिकेशन'],
+            'Payment': ['payment', 'pay', 'transaction', 'refund', 'money', 'bank', 'upi', 'wallet', 'भुगतान', 'पैसा'],
+            'Quality': ['quality', 'build', 'material', 'construction', 'durability', 'solid', 'sturdy', 'cheap', 'flimsy', 'गुणवत्ता'],
+            'Price': ['price', 'cost', 'expensive', 'cheap', 'value', 'money', 'affordable', 'budget', 'worth', 'कीमत', 'दाम'],
+            'Service': ['service', 'support', 'help', 'staff', 'customer', 'response', 'team', 'representative', 'सेवा', 'सहायता'],
+            'Delivery': ['delivery', 'shipping', 'fast', 'quick', 'slow', 'delayed', 'arrive', 'package', 'डिलीवरी', 'शिपिंग'],
+            'Design': ['design', 'look', 'appearance', 'beautiful', 'ugly', 'style', 'color', 'aesthetic', 'डिज़ाइन', 'रूप'],
+            'Performance': ['performance', 'speed', 'fast', 'slow', 'efficiency', 'works', 'function', 'smooth', 'प्रदर्शन'],
+            'Usability': ['easy', 'difficult', 'user', 'interface', 'intuitive', 'complex', 'simple', 'confusing', 'उपयोग'],
+            'Features': ['feature', 'function', 'capability', 'option', 'setting', 'mode', 'tool', 'फीचर', 'सुविधा'],
+            'Size': ['size', 'big', 'small', 'large', 'compact', 'tiny', 'huge', 'dimension', 'आकार'],
+            'Battery': ['battery', 'charge', 'power', 'energy', 'last', 'drain', 'life', 'बैटरी', 'चार्ज']
         }
         for aspect, keywords in aspect_keywords.items():