Spaces:

parthnuwal7
/

ABSA

Sleeping

parthnuwal7 commited on Sep 30, 2025

Commit

1b4a2d1

1 Parent(s): f71c767

ADD: API architecture files for split deployment

Added app_api.py - HF Spaces API entry point
Added backend_api.py - Dedicated ML API service
Added frontend_light.py - Streamlit Cloud frontend
Added requirements-backend.txt - ML dependencies
Added requirements-frontend.txt - Lightweight UI only

Architecture: HF Spaces (PyABSA backend) + Streamlit Cloud (frontend)
Next: Test HF Spaces deployment and configure API endpoints

Files changed (5) hide show

app_api.py +226 -0
backend_api.py +189 -0
frontend_light.py +266 -0
requirements-backend.txt +28 -0
requirements-frontend.txt +17 -0

app_api.py ADDED Viewed

	@@ -0,0 +1,226 @@

+"""
+HF Spaces API Entry Point - Modified app_enhanced.py for API mode
+Provides both Streamlit UI and API endpoints for backend processing
+"""
+import streamlit as st
+import pandas as pd
+import sys
+import os
+import json
+from typing import Dict, Any
+# Setup cache directories for Docker environment
+def setup_cache_directories():
+    """Setup cache directories with proper permissions for containerized environment."""
+    cache_dirs = [
+        os.path.expanduser("~/.cache"),
+        os.path.expanduser("~/.cache/huggingface"),
+        os.path.expanduser("~/.cache/huggingface/transformers"),
+        "/.cache",
+        "/.cache/huggingface",
+        "/.cache/huggingface/transformers"
+    ]
+    for cache_dir in cache_dirs:
+        try:
+            os.makedirs(cache_dir, exist_ok=True)
+            test_file = os.path.join(cache_dir, "test_write.tmp")
+            with open(test_file, 'w') as f:
+                f.write("test")
+            os.remove(test_file)
+        except (PermissionError, OSError):
+            continue
+# Initialize cache directories
+setup_cache_directories()
+# Add src to path for imports
+current_dir = os.path.dirname(os.path.abspath(__file__))
+src_path = os.path.join(current_dir, 'src')
+if src_path not in sys.path:
+    sys.path.insert(0, src_path)
+from utils.data_processor import DataProcessor
+from components.visualizations import VisualizationEngine
+# Page configuration
+st.set_page_config(
+    page_title="🤖 ABSA ML Backend API",
+    page_icon="🤖",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
+# Check if running in API mode (query parameter)
+query_params = st.experimental_get_query_params()
+api_mode = query_params.get('api', [False])[0]
+# Initialize processor (cached for performance)
+@st.cache_resource
+def get_data_processor():
+    """Initialize data processor with models cached."""
+    return DataProcessor()
+@st.cache_resource
+def get_visualization_engine():
+    """Initialize visualization engine."""
+    return VisualizationEngine()
+def serialize_for_api(results: Dict) -> Dict:
+    """Convert complex objects to JSON-serializable format for API responses."""
+    serialized = {}
+    for key, value in results.items():
+        if key == 'processed_data':
+            # Convert DataFrame to dict
+            serialized[key] = value.to_dict('records') if hasattr(value, 'to_dict') else value
+        elif key == 'aspect_network':
+            # Convert NetworkX graph to dict
+            import networkx as nx
+            if hasattr(value, 'nodes'):
+                serialized[key] = nx.node_link_data(value)
+            else:
+                serialized[key] = value
+        elif hasattr(value, 'to_dict'):
+            # Convert DataFrames
+            serialized[key] = value.to_dict('records')
+        elif isinstance(value, pd.DataFrame):
+            serialized[key] = value.to_dict('records')
+        else:
+            # Keep as is for basic types
+            serialized[key] = value
+    return serialized
+# API Mode - Process reviews via URL parameters
+if api_mode:
+    st.title("🤖 ABSA Processing API")
+    st.write("Backend processing endpoint - send POST requests with review data")
+    # Show API documentation
+    with st.expander("📚 API Documentation", expanded=True):
+        st.markdown("""
+        ### POST /api/process
+        **Request Format:**
+        ```json
+        {
+            "data": [
+                {
+                    "id": 1,
+                    "reviews_title": "Product Review",
+                    "review": "This product is amazing!",
+                    "date": "2025-10-01",
+                    "user_id": "user123"
+                }
+            ],
+            "options": {
+                "enable_translation": true,
+                "enable_absa": true
+            }
+        }
+        ```
+        **Response Format:**
+        ```json
+        {
+            "status": "success",
+            "data": {
+                "processed_data": [...],
+                "absa_details": [...],
+                "summary": {...}
+            }
+        }
+        ```
+        """)
+    # Test endpoint
+    st.subheader("🧪 Test Processing")
+    if st.button("Test with Sample Data"):
+        sample_data = [
+            {
+                'id': 1,
+                'reviews_title': 'Great Product',
+                'review': 'यह उत्पाद बहुत अच्छा है। गुणवत्ता उत्कृष्ट है।',
+                'date': '2025-10-01',
+                'user_id': 'user1'
+            },
+            {
+                'id': 2,
+                'reviews_title': 'Poor Service',
+                'review': 'The delivery was very slow and customer service was terrible.',
+                'date': '2025-09-30',
+                'user_id': 'user2'
+            }
+        ]
+        try:
+            with st.spinner("Processing with PyABSA & M2M100..."):
+                processor = get_data_processor()
+                df = pd.DataFrame(sample_data)
+                results = processor.process_uploaded_data(df)
+            if 'error' not in results:
+                st.success("✅ Processing successful!")
+                # Show API response format
+                api_response = {
+                    "status": "success",
+                    "data": serialize_for_api(results)
+                }
+                # Summary metrics
+                col1, col2, col3 = st.columns(3)
+                with col1:
+                    st.metric("Reviews Processed", len(results.get('processed_data', [])))
+                with col2:
+                    st.metric("Languages Detected", len(results.get('summary', {}).get('languages_detected', [])))
+                with col3:
+                    st.metric("Aspects Found", len(results.get('areas_of_improvement', [])) + len(results.get('strength_anchors', [])))
+                # Full API response
+                with st.expander("🔍 Full API Response"):
+                    st.json(api_response)
+            else:
+                st.error(f"❌ Processing failed: {results.get('error', 'Unknown error')}")
+        except Exception as e:
+            st.error(f"❌ Error: {str(e)}")
+    # Health status
+    st.subheader("💚 System Health")
+    processor = get_data_processor()
+    health = {
+        "translation_service": "available" if hasattr(processor.translator, 'model') else "unavailable",
+        "absa_service": "available" if hasattr(processor.absa_processor, 'aspect_extractor') else "unavailable",
+        "timestamp": pd.Timestamp.now().isoformat()
+    }
+    if all(status == "available" for status in [health["translation_service"], health["absa_service"]]):
+        st.success("🟢 All services operational")
+    else:
+        st.warning("🟡 Some services may be initializing...")
+    st.json(health)
+else:
+    # Regular Streamlit UI Mode - Your existing app_enhanced.py content
+    st.title("📊 Advanced Sentiment Analysis Dashboard")
+    st.subheader("🎯 Multi-dimensional Review Analytics with ABSA")
+    # Your existing app content here...
+    st.info("💡 This is the full dashboard interface. Add ?api=true to URL for API mode.")
+    # Add your existing app_enhanced.py content here
+    # ... (rest of your dashboard code)
+# Footer
+st.markdown("---")
+st.markdown("""
+**🤖 Backend Mode:** High-performance PyABSA + M2M100 processing
+**🌐 API Endpoint:** Add `?api=true` to URL for API documentation
+**⚡ Performance:** Optimized for HF Spaces deployment
+""")

backend_api.py ADDED Viewed

	@@ -0,0 +1,189 @@

+"""
+ML Backend API for HF Spaces deployment
+Provides PyABSA and M2M100 services via REST API
+"""
+import streamlit as st
+import pandas as pd
+import json
+from typing import Dict, List, Any
+from src.utils.data_processor import DataProcessor
+import logging
+# Configure for API mode
+st.set_page_config(
+    page_title="ABSA ML Backend API",
+    page_icon="🤖",
+    layout="wide"
+)
+logger = logging.getLogger(__name__)
+# Initialize processor (cached for performance)
+@st.cache_resource
+def get_data_processor():
+    """Initialize data processor with models cached."""
+    return DataProcessor()
+def process_reviews_api(reviews_data: Dict) -> Dict:
+    """
+    Process reviews via API endpoint.
+    Args:
+        reviews_data: Dictionary with reviews and options
+    Returns:
+        Processed results dictionary
+    """
+    try:
+        processor = get_data_processor()
+        # Create DataFrame from API input
+        df = pd.DataFrame(reviews_data['data'])
+        # Process data
+        results = processor.process_uploaded_data(df)
+        # Convert to JSON-serializable format
+        serialized_results = serialize_results(results)
+        return {
+            'status': 'success',
+            'data': serialized_results
+        }
+    except Exception as e:
+        logger.error(f"API processing error: {str(e)}")
+        return {
+            'status': 'error',
+            'message': str(e)
+        }
+def serialize_results(results: Dict) -> Dict:
+    """Convert complex objects to JSON-serializable format."""
+    serialized = {}
+    for key, value in results.items():
+        if key == 'processed_data':
+            # Convert DataFrame to dict
+            serialized[key] = value.to_dict('records')
+        elif key == 'aspect_network':
+            # Convert NetworkX graph to dict
+            import networkx as nx
+            serialized[key] = nx.node_link_data(value)
+        elif hasattr(value, 'to_dict'):
+            # Convert DataFrames
+            serialized[key] = value.to_dict('records')
+        else:
+            # Keep as is
+            serialized[key] = value
+    return serialized
+# Streamlit Interface for API Testing
+st.title("🤖 ABSA ML Backend API")
+st.subheader("High-Performance PyABSA & M2M100 Processing")
+# API Documentation
+with st.expander("📚 API Documentation", expanded=True):
+    st.markdown("""
+    ### Endpoints
+    **POST** `/process-reviews`
+    ```json
+    {
+        "data": [
+            {
+                "id": 1,
+                "reviews_title": "Product Review",
+                "review": "This product is amazing!",
+                "date": "2025-09-30",
+                "user_id": "user123"
+            }
+        ]
+    }
+    ```
+    ### Response
+    ```json
+    {
+        "status": "success",
+        "data": {
+            "processed_data": [...],
+            "absa_details": [...],
+            "analytics": {...}
+        }
+    }
+    ```
+    """)
+# Test Interface
+st.subheader("🧪 Test ML Processing")
+# Sample data for testing
+if st.button("Test with Sample Data"):
+    sample_data = {
+        'data': [
+            {
+                'id': 1,
+                'reviews_title': 'Great Product',
+                'review': 'यह उत्पाद बहुत अच्छा है। गुणवत्ता उत्कृष्ट है।',
+                'date': '2025-09-30',
+                'user_id': 'user1'
+            },
+            {
+                'id': 2,
+                'reviews_title': 'Poor Service',
+                'review': 'The delivery was very slow and customer service was terrible.',
+                'date': '2025-09-29',
+                'user_id': 'user2'
+            }
+        ]
+    }
+    with st.spinner("Processing with ML models..."):
+        results = process_reviews_api(sample_data)
+    if results['status'] == 'success':
+        st.success("✅ Processing successful!")
+        # Show summary
+        data = results['data']
+        st.json({
+            'total_reviews': len(data.get('processed_data', [])),
+            'languages_detected': data.get('summary', {}).get('languages_detected', []),
+            'sentiment_distribution': data.get('summary', {}).get('sentiment_distribution', {}),
+            'top_aspects_found': len(data.get('areas_of_improvement', [])) + len(data.get('strength_anchors', []))
+        })
+        # Full results in expander
+        with st.expander("🔍 Full Results"):
+            st.json(results)
+    else:
+        st.error(f"❌ Processing failed: {results.get('message', 'Unknown error')}")
+# Performance Metrics
+st.subheader("📊 Backend Performance")
+col1, col2, col3 = st.columns(3)
+with col1:
+    st.metric("Models Loaded", "2", help="PyABSA + M2M100")
+with col2:
+    st.metric("Memory Usage", "~2GB", help="Estimated RAM usage")
+with col3:
+    st.metric("Processing Speed", "~10s", help="Per 100 reviews")
+# Health Check
+st.subheader("💚 Health Status")
+processor = get_data_processor()
+health_status = {}
+health_status['translation_ready'] = processor.translator.model is not None
+health_status['absa_ready'] = hasattr(processor.absa_processor, 'api_token')
+health_status['overall'] = all(health_status.values())
+if health_status['overall']:
+    st.success("🟢 All systems operational")
+else:
+    st.error("🔴 Some services unavailable")
+st.json(health_status)

frontend_light.py ADDED Viewed

	@@ -0,0 +1,266 @@

+"""
+Frontend-only Streamlit app for Streamlit Cloud deployment
+Connects to HF Spaces ML backend via API calls
+"""
+import streamlit as st
+import pandas as pd
+import requests
+import json
+import plotly.express as px
+import plotly.graph_objects as go
+from typing import Dict, Any
+import time
+# Lightweight requirements - no ML dependencies!
+st.set_page_config(
+    page_title="📊 Advanced Sentiment Analytics",
+    page_icon="📊",
+    layout="wide"
+)
+# Configuration
+HF_SPACES_API_URL = "https://your-username-absa-backend.hf.space"  # Update with your HF Space URL
+def call_ml_backend(data: Dict) -> Dict:
+    """Call the ML backend API on HF Spaces."""
+    try:
+        response = requests.post(
+            f"{HF_SPACES_API_URL}/process-reviews",
+            json=data,
+            timeout=120  # Allow time for ML processing
+        )
+        response.raise_for_status()
+        return response.json()
+    except requests.exceptions.Timeout:
+        return {"status": "error", "message": "Backend processing timeout (>2 minutes)"}
+    except requests.exceptions.ConnectionError:
+        return {"status": "error", "message": "Cannot connect to ML backend"}
+    except Exception as e:
+        return {"status": "error", "message": f"API error: {str(e)}"}
+def create_lightweight_visualizations(processed_data: Dict):
+    """Create visualizations from processed data (no ML dependencies)."""
+    # Sentiment Distribution
+    if 'sentiment_distribution' in processed_data.get('summary', {}):
+        sentiment_dist = processed_data['summary']['sentiment_distribution']
+        fig = px.pie(
+            values=list(sentiment_dist.values()),
+            names=list(sentiment_dist.keys()),
+            title="Overall Sentiment Distribution"
+        )
+        st.plotly_chart(fig, use_container_width=True)
+    # Intent Analysis
+    if 'intents_distribution' in processed_data.get('summary', {}):
+        intents_dist = processed_data['summary']['intents_distribution']
+        fig = px.bar(
+            x=list(intents_dist.keys()),
+            y=list(intents_dist.values()),
+            title="Intent Classification Results"
+        )
+        st.plotly_chart(fig, use_container_width=True)
+    # Areas of Improvement
+    if processed_data.get('areas_of_improvement'):
+        improvements_df = pd.DataFrame(processed_data['areas_of_improvement'])
+        if not improvements_df.empty:
+            fig = px.bar(
+                improvements_df.head(10),
+                x='priority_score',
+                y='aspect',
+                orientation='h',
+                title="Top Areas for Improvement",
+                color='negativity_pct',
+                color_continuous_scale='Reds'
+            )
+            st.plotly_chart(fig, use_container_width=True)
+# Main App Interface
+st.title("📊 Advanced Sentiment Analytics Dashboard")
+st.subheader("Powered by PyABSA & M2M100 (Backend: HF Spaces)")
+# Sidebar for configuration
+with st.sidebar:
+    st.header("⚙️ Configuration")
+    # Backend status check
+    st.subheader("🔗 Backend Status")
+    if st.button("Check ML Backend"):
+        with st.spinner("Checking backend..."):
+            try:
+                response = requests.get(f"{HF_SPACES_API_URL}/health", timeout=10)
+                if response.status_code == 200:
+                    st.success("🟢 Backend Online")
+                else:
+                    st.error("🔴 Backend Issues")
+            except:
+                st.error("🔴 Backend Offline")
+    # Processing options
+    st.subheader("🎛️ Processing Options")
+    enable_translation = st.checkbox("Enable Translation", value=True)
+    enable_absa = st.checkbox("Enable ABSA", value=True)
+    # Cost info
+    st.info("""
+    💡 **Architecture Benefits:**
+    - Frontend: Free Streamlit Cloud
+    - Backend: HF Spaces (pay-per-use)
+    - No local ML dependencies
+    - Automatic scaling
+    """)
+# File Upload
+st.header("📁 Upload Reviews Data")
+uploaded_file = st.file_uploader(
+    "Choose CSV file",
+    type="csv",
+    help="Upload CSV with columns: id, reviews_title, review, date, user_id"
+)
+if uploaded_file is not None:
+    # Read and validate data
+    try:
+        df = pd.read_csv(uploaded_file)
+        st.success(f"✅ Uploaded {len(df)} reviews")
+        # Show preview
+        st.subheader("📋 Data Preview")
+        st.dataframe(df.head(), use_container_width=True)
+        # Validation
+        required_cols = ['id', 'reviews_title', 'review', 'date', 'user_id']
+        missing_cols = [col for col in required_cols if col not in df.columns]
+        if missing_cols:
+            st.error(f"❌ Missing required columns: {missing_cols}")
+        else:
+            st.success("✅ Data format validated")
+            # Process button
+            if st.button("🚀 Process with ML Backend", type="primary"):
+                # Prepare API payload
+                api_data = {
+                    "data": df.to_dict('records'),
+                    "options": {
+                        "enable_translation": enable_translation,
+                        "enable_absa": enable_absa
+                    }
+                }
+                # Call backend
+                with st.spinner("🤖 Processing with PyABSA & M2M100 models..."):
+                    progress_bar = st.progress(0)
+                    # Simulate progress (since we can't track actual backend progress)
+                    for i in range(10):
+                        time.sleep(0.5)
+                        progress_bar.progress((i + 1) / 10)
+                    results = call_ml_backend(api_data)
+                # Handle results
+                if results.get("status") == "success":
+                    st.success("✅ Processing completed successfully!")
+                    # Display results
+                    processed_data = results["data"]
+                    # Summary metrics
+                    st.header("📊 Analysis Summary")
+                    col1, col2, col3, col4 = st.columns(4)
+                    summary = processed_data.get('summary', {})
+                    with col1:
+                        st.metric(
+                            "Total Reviews",
+                            summary.get('total_reviews', 0)
+                        )
+                    with col2:
+                        languages = summary.get('languages_detected', [])
+                        st.metric(
+                            "Languages Detected",
+                            len(languages)
+                        )
+                    with col3:
+                        improvements = len(processed_data.get('areas_of_improvement', []))
+                        st.metric(
+                            "Problem Areas",
+                            improvements
+                        )
+                    with col4:
+                        strengths = len(processed_data.get('strength_anchors', []))
+                        st.metric(
+                            "Strength Areas",
+                            strengths
+                        )
+                    # Visualizations
+                    st.header("📈 Analytics Dashboard")
+                    create_lightweight_visualizations(processed_data)
+                    # Detailed results
+                    with st.expander("🔍 Detailed Results"):
+                        st.json(processed_data)
+                    # Download options
+                    st.header("💾 Export Results")
+                    # Convert to downloadable format
+                    if processed_data.get('processed_data'):
+                        result_df = pd.DataFrame(processed_data['processed_data'])
+                        csv = result_df.to_csv(index=False)
+                        st.download_button(
+                            label="📥 Download Processed Data (CSV)",
+                            data=csv,
+                            file_name=f"sentiment_analysis_{time.strftime('%Y%m%d_%H%M%S')}.csv",
+                            mime="text/csv"
+                        )
+                else:
+                    st.error(f"❌ Processing failed: {results.get('message', 'Unknown error')}")
+                    # Troubleshooting info
+                    with st.expander("🔧 Troubleshooting"):
+                        st.write("**Common Issues:**")
+                        st.write("- Backend may be starting up (first use takes 2-3 minutes)")
+                        st.write("- Large datasets may timeout (try smaller batches)")
+                        st.write("- Check backend status in sidebar")
+                        st.write("**Backend URL:**", HF_SPACES_API_URL)
+                        st.json(results)
+    except Exception as e:
+        st.error(f"❌ Error reading file: {str(e)}")
+else:
+    # Show sample data format
+    st.info("👆 Upload a CSV file to get started")
+    st.subheader("📝 Expected CSV Format")
+    sample_df = pd.DataFrame({
+        'id': [1, 2, 3],
+        'reviews_title': ['Great Product', 'Poor Service', 'Average Quality'],
+        'review': [
+            'यह उत्पाद बहुत अच्छा है',
+            'The delivery was very slow',
+            'Product is okay, nothing special'
+        ],
+        'date': ['2025-09-30', '2025-09-29', '2025-09-28'],
+        'user_id': ['user1', 'user2', 'user3']
+    })
+    st.dataframe(sample_df, use_container_width=True)
+# Footer
+st.markdown("---")
+st.markdown("""
+**Architecture:** Frontend (Streamlit Cloud) ↔ ML Backend (HF Spaces)
+**Models:** PyABSA (Multilingual) + Facebook M2M100 (418M)
+**Benefits:** Free frontend, scalable ML processing, high accuracy
+""")

requirements-backend.txt ADDED Viewed

	@@ -0,0 +1,28 @@

+# Backend Requirements - HF Spaces with PyABSA
+# Full ML stack for high-accuracy processing
+# Core ML frameworks
+torch>=2.0.0,<2.2.0
+transformers>=4.30.0,<4.37.0
+pyabsa>=2.4.0,<3.0.0
+sentencepiece>=0.1.99
+sacremoses>=0.0.53
+faiss-cpu>=1.7.4
+# Data processing
+pandas>=1.5.0,<2.1.0
+numpy>=1.24.0,<1.26.0
+scikit-learn>=1.3.0,<1.4.0
+langdetect>=1.0.9
+# API and web framework
+streamlit>=1.28.0,<1.30.0
+requests>=2.31.0
+# Utilities
+joblib>=1.3.0
+tqdm>=4.65.0
+pillow>=10.0.0,<10.2.0
+# Optional for network analysis (if needed)
+networkx>=3.0

requirements-frontend.txt ADDED Viewed

	@@ -0,0 +1,17 @@

+# Frontend Requirements - Streamlit Cloud Compatible
+# Only UI and API libraries - NO ML dependencies
+streamlit>=1.28.0
+pandas>=1.5.0
+numpy>=1.24.0
+plotly>=5.15.0
+requests>=2.31.0
+streamlit-option-menu>=0.3.6
+streamlit-aggrid>=0.3.4
+# Optional extras for enhanced UI
+pillow>=10.0.0
+openpyxl>=3.1.0
+# Total size: ~50MB (vs 1.5GB with ML libraries)
+# Perfect for Streamlit Cloud free tier!