SHAFI committed on
Commit
813a09f
·
1 Parent(s): ec623dd

fix: Integration fixes for Bloom Filter, ID mismatch, and validation crashes

Browse files
app/routes/admin.py CHANGED
@@ -545,3 +545,220 @@ async def preview_newsletter_content(preference: str):
545
  detail=f"Failed to preview content: {str(e)}"
546
  )
547
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
545
  detail=f"Failed to preview content: {str(e)}"
546
  )
547
 
548
+
549
+ # ===========================================
550
+ # Bloom Filter Management (Integration Fix #1)
551
+ # ===========================================
552
+
553
+ @router.post("/bloom-filter/reset")
554
+ async def reset_bloom_filter():
555
+ """
556
+ Reset Scalable Bloom Filter - Integration Sync Mechanism
557
+
558
+ **USE CASE**: After clearing the Appwrite database, call this endpoint
559
+ to reset the Bloom Filter to match the empty database state.
560
+
561
+ **WHY THIS IS NEEDED**:
562
+ In production (Hugging Face Spaces), the Bloom Filter persists on disk
563
+ even when the database is cleared via the Appwrite dashboard. This causes
564
+ 100% duplicate detection because the filter "remembers" old URLs that no
565
+ longer exist in the database.
566
+
567
+ **INTEGRATION CONTRACT**:
568
+ - When you clear Appwrite DB β†’ Call this endpoint
569
+ - Filter state syncs with database state
570
+ - Fresh ingestion can proceed
571
+
572
+ Returns:
573
+ Status and statistics before/after reset
574
+ """
575
+ try:
576
+ from app.services.deduplication import get_url_filter
577
+
578
+ # Get the global filter instance
579
+ url_filter = get_url_filter()
580
+
581
+ # Capture stats before reset
582
+ stats_before = url_filter.get_stats()
583
+
584
+ # Reset the filter
585
+ url_filter.reset()
586
+
587
+ # Capture stats after reset
588
+ stats_after = url_filter.get_stats()
589
+
590
+ return {
591
+ "success": True,
592
+ "message": "Scalable Bloom Filter reset successfully",
593
+ "operation": "Integration sync - Filter state now matches empty database",
594
+ "stats_before_reset": {
595
+ "total_checks": stats_before['total_checks'],
596
+ "unique_urls_added": stats_before['unique_urls_added'],
597
+ "duplicates_detected": stats_before['duplicates_detected'],
598
+ "filter_buckets": stats_before['filter_buckets'],
599
+ "estimated_capacity": stats_before['estimated_current_capacity']
600
+ },
601
+ "stats_after_reset": {
602
+ "filter_buckets": stats_after['filter_buckets'],
603
+ "last_reset": stats_after['last_reset']
604
+ },
605
+ "note": "Filter is now ready for fresh ingestion. Next fetch will save all articles."
606
+ }
607
+
608
+ except Exception as e:
609
+ raise HTTPException(
610
+ status_code=500,
611
+ detail=f"Failed to reset Bloom Filter: {str(e)}"
612
+ )
613
+
614
+
615
+ @router.get("/bloom-filter/stats")
616
+ async def get_bloom_filter_stats():
617
+ """
618
+ Get Scalable Bloom Filter statistics - Observability Endpoint
619
+
620
+ Shows:
621
+ - Total URLs processed
622
+ - Duplicate detection rate
623
+ - Filter bucket count (auto-scaling metric)
624
+ - Memory usage estimate
625
+ - Last persistence time
626
+
627
+ **PRODUCTION DIAGNOSTIC**: Use this to verify filter state
628
+ and detect saturation issues.
629
+
630
+ Returns:
631
+ Comprehensive filter statistics
632
+ """
633
+ try:
634
+ from app.services.deduplication import get_url_filter
635
+
636
+ url_filter = get_url_filter()
637
+ stats = url_filter.get_stats()
638
+
639
+ # Calculate additional metrics
640
+ memory_usage = url_filter.get_estimated_memory_usage()
641
+
642
+ return {
643
+ "success": True,
644
+ "filter_type": "ScalableBloomFilter (pybloom_live)",
645
+ "persistence_enabled": True,
646
+ "persistence_path": url_filter.persistence_path,
647
+ "statistics": {
648
+ "total_checks": stats['total_checks'],
649
+ "unique_urls_added": stats['unique_urls_added'],
650
+ "duplicates_detected": stats['duplicates_detected'],
651
+ "duplicate_rate_percent": stats['duplicate_rate_percent'],
652
+ "filter_buckets": stats['filter_buckets'],
653
+ "initial_capacity": stats['initial_capacity'],
654
+ "current_estimated_capacity": stats['estimated_current_capacity'],
655
+ "error_rate": stats['filter_error_rate'],
656
+ "is_scalable": stats['is_scalable'],
657
+ "last_reset": stats['last_reset'],
658
+ "last_save": stats['last_save']
659
+ },
660
+ "memory": {
661
+ "estimated_usage": memory_usage,
662
+ "note": "Scalable Bloom Filter auto-grows as needed"
663
+ },
664
+ "health": {
665
+ "status": "healthy" if stats['duplicate_rate_percent'] < 95 else "warning",
666
+ "warning": "100% duplicate rate detected - consider reset" if stats['duplicate_rate_percent'] >= 99.5 else None
667
+ }
668
+ }
669
+
670
+ except Exception as e:
671
+ raise HTTPException(
672
+ status_code=500,
673
+ detail=f"Failed to get Bloom Filter stats: {str(e)}"
674
+ )
675
+
676
+
677
+ @router.get("/bloom-filter/health")
678
+ async def bloom_filter_health_check():
679
+ """
680
+ Quick health check for Bloom Filter - Production Monitoring
681
+
682
+ Returns:
683
+ - Status: healthy/warning/critical
684
+ - Reason for any issues
685
+ - Recommended action
686
+
687
+ **ALERTING**: Use this for automated monitoring.
688
+ A "critical" status means ingestion is likely broken.
689
+ """
690
+ try:
691
+ from app.services.deduplication import get_url_filter
692
+ from app.services.appwrite_db import get_appwrite_db
693
+ import os
694
+
695
+ url_filter = get_url_filter()
696
+ stats = url_filter.get_stats()
697
+ appwrite_db = get_appwrite_db()
698
+
699
+ # Check 1: Duplicate rate health
700
+ duplicate_rate = stats['duplicate_rate_percent']
701
+
702
+ # Check 2: Filter persistence file exists
703
+ filter_file_exists = os.path.exists(url_filter.persistence_path)
704
+
705
+ # Check 3: Database initialized
706
+ db_initialized = appwrite_db.initialized
707
+
708
+ # Determine health status
709
+ issues = []
710
+ status = "healthy"
711
+
712
+ if duplicate_rate >= 99.5:
713
+ status = "critical"
714
+ issues.append({
715
+ "type": "duplicate_saturation",
716
+ "severity": "critical",
717
+ "details": f"Duplicate rate: {duplicate_rate}% (expected < 95%)",
718
+ "action": "Reset Bloom Filter via POST /admin/bloom-filter/reset"
719
+ })
720
+ elif duplicate_rate >= 90:
721
+ status = "warning"
722
+ issues.append({
723
+ "type": "high_duplicates",
724
+ "severity": "warning",
725
+ "details": f"Duplicate rate: {duplicate_rate}% (expected < 90%)",
726
+ "action": "Monitor ingestion logs for validation issues"
727
+ })
728
+
729
+ if not filter_file_exists:
730
+ issues.append({
731
+ "type": "missing_persistence",
732
+ "severity": "info",
733
+ "details": "Filter persistence file not found (filter will create on first save)",
734
+ "action": "No action needed - this is normal on first run"
735
+ })
736
+
737
+ if not db_initialized:
738
+ status = "critical"
739
+ issues.append({
740
+ "type": "database_disconnected",
741
+ "severity": "critical",
742
+ "details": "Appwrite database not initialized",
743
+ "action": "Check Appwrite credentials in environment variables"
744
+ })
745
+
746
+ return {
747
+ "status": status,
748
+ "timestamp": stats['last_reset'],
749
+ "checks_performed": {
750
+ "duplicate_rate": duplicate_rate,
751
+ "filter_persisted": filter_file_exists,
752
+ "database_initialized": db_initialized
753
+ },
754
+ "issues": issues if issues else [],
755
+ "recommendation": issues[0]["action"] if issues else "System healthy - all checks passed"
756
+ }
757
+
758
+ except Exception as e:
759
+ return {
760
+ "status": "error",
761
+ "error": str(e),
762
+ "recommendation": "Check server logs for detailed error information"
763
+ }
764
+
app/services/appwrite_db.py CHANGED
@@ -88,28 +88,37 @@ class AppwriteDatabase:
88
 
89
  def _generate_url_hash(self, url: str) -> str:
90
  """
91
- Generate a unique hash for an article URL (with canonicalization)
92
 
93
- Uses canonical URL normalization to catch duplicate stories:
94
- - https://cnn.com/story?utm_source=twitter
95
- - https://www.cnn.com/story?ref=homepage
96
- Both map to same hash!
 
 
 
 
 
 
 
97
 
98
  Args:
99
- url: Article URL
100
 
101
  Returns:
102
- 16-character hex hash
 
 
 
 
103
  """
104
- from app.utils.url_canonicalization import canonicalize_url
105
  import hashlib
106
 
107
- # Canonicalize URL first for better deduplication
108
- canonical_url = canonicalize_url(url)
109
 
110
- # Generate hash from canonical URL
111
- hash_bytes = hashlib.sha256(canonical_url.encode('utf-8')).hexdigest()
112
- return hash_bytes[:16] # First 16 characters
113
 
114
  async def get_articles(self, category: str, limit: int = 20, offset: int = 0) -> List[Dict]:
115
  """
 
88
 
89
  def _generate_url_hash(self, url: str) -> str:
90
  """
91
+ Generate a unique hash for an article URL
92
 
93
+ **INTEGRATION FIX #2**: Updated to match frontend ID generation
94
+
95
+ Uses SHA-256 hash of the RAW URL (not canonicalized) to ensure
96
+ Frontend and Backend generate IDENTICAL IDs for the same article.
97
+
98
+ **WHY THE CHANGE**:
99
+ - OLD: 16-char hash of canonical URL (different from frontend)
100
+ - NEW: 32-char hash of raw URL (matches frontend exactly)
101
+
102
+ **NOTE**: Canonicalization is still used for Appwrite deduplication
103
+ via the unique constraint, but NOT for ID generation.
104
 
105
  Args:
106
+ url: Article URL (raw, not canonicalized)
107
 
108
  Returns:
109
+ 32-character hex hash (Appwrite-compatible, frontend-compatible)
110
+
111
+ Example:
112
+ >>> _generate_url_hash("https://cnn.com/article?utm=123")
113
+ "a1b2c3d4e5f67890abcdef1234567890" # 32 chars
114
  """
 
115
  import hashlib
116
 
117
+ # Generate SHA-256 hash from RAW URL (no canonicalization for ID)
118
+ hash_bytes = hashlib.sha256(url.encode('utf-8')).hexdigest()
119
 
120
+ # Return first 32 characters (matches frontend idGenerator.ts)
121
+ return hash_bytes[:32]
 
122
 
123
  async def get_articles(self, category: str, limit: int = 20, offset: int = 0) -> List[Dict]:
124
  """
app/utils/data_validation.py CHANGED
@@ -278,8 +278,9 @@ def is_relevant_to_category(article: Union[Dict, 'Article'], category: str) -> b
278
  return True
279
 
280
  # Combine title and description for checking
281
- title = article_dict.get('title', '').lower()
282
- description = article_dict.get('description', '').lower()
 
283
  text = f"{title} {description}"
284
 
285
  # Count keyword matches
 
278
  return True
279
 
280
  # Combine title and description for checking
281
+ # FIX: Use (value or '') pattern to handle explicit None values from messy RSS feeds
282
+ title = (article_dict.get('title') or '').lower()
283
+ description = (article_dict.get('description') or '').lower()
284
  text = f"{title} {description}"
285
 
286
  # Count keyword matches
app/utils/id_generator.py CHANGED
@@ -17,6 +17,8 @@ def generate_article_id(url: str) -> str:
17
  """
18
  Generate Appwrite-compatible ID from URL
19
 
 
 
20
  Uses SHA-256 hash truncated to 32 characters to ensure
21
  it fits within Appwrite's 36-character document ID limit.
22
 
 
17
  """
18
  Generate Appwrite-compatible ID from URL
19
 
20
+ **INTEGRATION FIX #2**: Matches frontend and appwrite_db.py
21
+
22
  Uses SHA-256 hash truncated to 32 characters to ensure
23
  it fits within Appwrite's 36-character document ID limit.
24
 
verify_integration_fixes.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Integration Fix Verification Script
3
+ ====================================
4
+
5
+ Tests all 3 integration fixes to ensure they're working correctly.
6
+
7
+ Run this script to verify:
8
+ 1. Bloom Filter reset functionality
9
+ 2. ID generation consistency (32-char)
10
+ 3. Validation null-safety
11
+
12
+ Usage:
13
+ python verify_integration_fixes.py
14
+ """
15
+
16
+ import asyncio
17
+ import hashlib
18
+
19
+
20
+ def test_id_generation():
21
+ """
22
+ Test #2: Verify ID generation consistency
23
+
24
+ Frontend and Backend MUST generate identical IDs for the same URL
25
+ """
26
+ print("=" * 70)
27
+ print("πŸ” TEST 1: ID Generation Consistency")
28
+ print("=" * 70)
29
+
30
+ test_urls = [
31
+ "https://cnn.com/article?utm_source=twitter",
32
+ "https://techcrunch.com/2024/ai-news",
33
+ "https://example.com/test"
34
+ ]
35
+
36
+ # Import backend ID generators
37
+ from app.services.appwrite_db import AppwriteDatabase
38
+ from app.utils.id_generator import generate_article_id as backend_gen
39
+
40
+ db = AppwriteDatabase()
41
+
42
+ all_match = True
43
+
44
+ for url in test_urls:
45
+ # Backend Method 1 (appwrite_db)
46
+ id_appwrite = db._generate_url_hash(url)
47
+
48
+ # Backend Method 2 (id_generator)
49
+ id_backend = backend_gen(url)
50
+
51
+ # Simulate frontend (same algo)
52
+ hash_obj = hashlib.sha256(url.encode('utf-8'))
53
+ id_frontend = hash_obj.hexdigest()[:32]
54
+
55
+ print(f"\nURL: {url[:50]}...")
56
+ print(f" Frontend: {id_frontend} (len={len(id_frontend)})")
57
+ print(f" Backend1: {id_appwrite} (len={len(id_appwrite)})")
58
+ print(f" Backend2: {id_backend} (len={len(id_backend)})")
59
+
60
+ # Verify all match
61
+ if id_frontend == id_appwrite == id_backend and len(id_frontend) == 32:
62
+ print(" βœ… PASS: All IDs match and are 32 chars")
63
+ else:
64
+ print(" ❌ FAIL: ID mismatch detected!")
65
+ all_match = False
66
+
67
+ print("\n" + "=" * 70)
68
+ if all_match:
69
+ print("βœ… TEST 1 PASSED: ID generation is consistent across all components")
70
+ else:
71
+ print("❌ TEST 1 FAILED: ID generation mismatch detected")
72
+ print("=" * 70)
73
+ print()
74
+
75
+ return all_match
76
+
77
+
78
+ def test_validation_null_safety():
79
+ """
80
+ Test #3: Verify null-safety in validation
81
+
82
+ Should handle articles with None fields gracefully
83
+ """
84
+ print("=" * 70)
85
+ print("πŸ” TEST 2: Validation Null-Safety")
86
+ print("=" * 70)
87
+
88
+ from app.utils.data_validation import is_relevant_to_category
89
+
90
+ # Test articles with None fields (production edge case)
91
+ test_articles = [
92
+ {
93
+ "title": "AI News Update",
94
+ "description": None, # Explicit None
95
+ "url": "https://test.com/1"
96
+ },
97
+ {
98
+ "title": None, # Explicit None
99
+ "description": "Some content",
100
+ "url": "https://test.com/2"
101
+ },
102
+ {
103
+ "title": "Cloud Computing",
104
+ "description": "", # Empty string
105
+ "url": "https://test.com/3"
106
+ }
107
+ ]
108
+
109
+ all_pass = True
110
+
111
+ for i, article in enumerate(test_articles, 1):
112
+ try:
113
+ result = is_relevant_to_category(article, "ai")
114
+ print(f"\n Article {i}: βœ… No crash (relevant={result})")
115
+ print(f" Title: {article.get('title')}")
116
+ print(f" Description: {article.get('description')}")
117
+ except AttributeError as e:
118
+ print(f"\n Article {i}: ❌ CRASH - {e}")
119
+ all_pass = False
120
+ except Exception as e:
121
+ print(f"\n Article {i}: ⚠️ Other error - {e}")
122
+
123
+ print("\n" + "=" * 70)
124
+ if all_pass:
125
+ print("βœ… TEST 2 PASSED: Validation handles None fields gracefully")
126
+ else:
127
+ print("❌ TEST 2 FAILED: Validation crashes on None fields")
128
+ print("=" * 70)
129
+ print()
130
+
131
+ return all_pass
132
+
133
+
134
+ async def test_bloom_filter_reset():
135
+ """
136
+ Test #1: Verify Bloom Filter reset functionality
137
+
138
+ Should successfully reset and show before/after stats
139
+ """
140
+ print("=" * 70)
141
+ print("πŸ” TEST 3: Bloom Filter Reset")
142
+ print("=" * 70)
143
+
144
+ from app.services.deduplication import get_url_filter
145
+
146
+ try:
147
+ # Get filter instance
148
+ url_filter = get_url_filter()
149
+
150
+ # Add some test URLs
151
+ test_urls = [
152
+ "https://test1.com/article",
153
+ "https://test2.com/news",
154
+ "https://test3.com/update"
155
+ ]
156
+
157
+ print("\nAdding test URLs to filter...")
158
+ for url in test_urls:
159
+ url_filter.check_and_add(url)
160
+
161
+ # Get stats before reset
162
+ stats_before = url_filter.get_stats()
163
+ print(f"\nπŸ“Š Before Reset:")
164
+ print(f" Total Checks: {stats_before['total_checks']}")
165
+ print(f" Unique URLs: {stats_before['unique_urls_added']}")
166
+ print(f" Duplicates: {stats_before['duplicates_detected']}")
167
+
168
+ # Reset the filter
169
+ print("\nπŸ”„ Resetting filter...")
170
+ url_filter.reset()
171
+
172
+ # Get stats after reset
173
+ stats_after = url_filter.get_stats()
174
+ print(f"\nπŸ“Š After Reset:")
175
+ print(f" Total Checks: {stats_after['total_checks']}")
176
+ print(f" Unique URLs: {stats_after['unique_urls_added']}")
177
+ print(f" Duplicates: {stats_after['duplicates_detected']}")
178
+
179
+ # Verify reset worked
180
+ if stats_after['unique_urls_added'] == 0:
181
+ print("\nβœ… Filter reset successfully (all counters zeroed)")
182
+ success = True
183
+ else:
184
+ print("\n❌ Filter reset failed (counters not zeroed)")
185
+ success = False
186
+
187
+ except Exception as e:
188
+ print(f"\n❌ Test failed with error: {e}")
189
+ import traceback
190
+ traceback.print_exc()
191
+ success = False
192
+
193
+ print("\n" + "=" * 70)
194
+ if success:
195
+ print("βœ… TEST 3 PASSED: Bloom Filter reset works correctly")
196
+ else:
197
+ print("❌ TEST 3 FAILED: Bloom Filter reset has issues")
198
+ print("=" * 70)
199
+ print()
200
+
201
+ return success
202
+
203
+
204
+ async def run_all_tests():
205
+ """Run all verification tests"""
206
+ print("\n" + "πŸš€" * 35)
207
+ print("INTEGRATION FIX VERIFICATION SUITE")
208
+ print("πŸš€" * 35 + "\n")
209
+
210
+ results = {}
211
+
212
+ # Test 1: ID Generation
213
+ results['id_generation'] = test_id_generation()
214
+
215
+ # Test 2: Validation
216
+ results['validation'] = test_validation_null_safety()
217
+
218
+ # Test 3: Bloom Filter
219
+ results['bloom_filter'] = await test_bloom_filter_reset()
220
+
221
+ # Summary
222
+ print("\n" + "=" * 70)
223
+ print("πŸ“‹ FINAL SUMMARY")
224
+ print("=" * 70)
225
+
226
+ all_passed = all(results.values())
227
+
228
+ for test_name, passed in results.items():
229
+ status = "βœ… PASS" if passed else "❌ FAIL"
230
+ print(f"{status} {test_name.replace('_', ' ').title()}")
231
+
232
+ print("=" * 70)
233
+
234
+ if all_passed:
235
+ print("\nπŸŽ‰ ALL TESTS PASSED - Integration fixes working correctly!")
236
+ print("βœ… System is ready for production deployment")
237
+ else:
238
+ print("\n⚠️ SOME TESTS FAILED - Review errors above")
239
+ print("❌ Fix issues before deploying to production")
240
+
241
+ print("\n")
242
+ return all_passed
243
+
244
+
245
+ if __name__ == "__main__":
246
+ # Run tests
247
+ success = asyncio.run(run_all_tests())
248
+
249
+ # Exit code for CI/CD
250
+ exit(0 if success else 1)