SHAFI committed on
Commit
813a09f
·
1 Parent(s): ec623dd

fix: Integration fixes for Bloom Filter, ID mismatch, and validation crashes

Browse files
app/routes/admin.py CHANGED
@@ -545,3 +545,220 @@ async def preview_newsletter_content(preference: str):
545
  detail=f"Failed to preview content: {str(e)}"
546
  )
547
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
545
  detail=f"Failed to preview content: {str(e)}"
546
  )
547
 
548
+
549
+ # ===========================================
550
+ # Bloom Filter Management (Integration Fix #1)
551
+ # ===========================================
552
+
553
+ @router.post("/bloom-filter/reset")
554
+ async def reset_bloom_filter():
555
+ """
556
+ Reset Scalable Bloom Filter - Integration Sync Mechanism
557
+
558
+ **USE CASE**: After clearing the Appwrite database, call this endpoint
559
+ to reset the Bloom Filter to match the empty database state.
560
+
561
+ **WHY THIS IS NEEDED**:
562
+ In production (Hugging Face Spaces), the Bloom Filter persists on disk
563
+ even when the database is cleared via the Appwrite dashboard. This causes
564
+ 100% duplicate detection because the filter "remembers" old URLs that no
565
+ longer exist in the database.
566
+
567
+ **INTEGRATION CONTRACT**:
568
+ - When you clear Appwrite DB β†’ Call this endpoint
569
+ - Filter state syncs with database state
570
+ - Fresh ingestion can proceed
571
+
572
+ Returns:
573
+ Status and statistics before/after reset
574
+ """
575
+ try:
576
+ from app.services.deduplication import get_url_filter
577
+
578
+ # Get the global filter instance
579
+ url_filter = get_url_filter()
580
+
581
+ # Capture stats before reset
582
+ stats_before = url_filter.get_stats()
583
+
584
+ # Reset the filter
585
+ url_filter.reset()
586
+
587
+ # Capture stats after reset
588
+ stats_after = url_filter.get_stats()
589
+
590
+ return {
591
+ "success": True,
592
+ "message": "Scalable Bloom Filter reset successfully",
593
+ "operation": "Integration sync - Filter state now matches empty database",
594
+ "stats_before_reset": {
595
+ "total_checks": stats_before['total_checks'],
596
+ "unique_urls_added": stats_before['unique_urls_added'],
597
+ "duplicates_detected": stats_before['duplicates_detected'],
598
+ "filter_buckets": stats_before['filter_buckets'],
599
+ "estimated_capacity": stats_before['estimated_current_capacity']
600
+ },
601
+ "stats_after_reset": {
602
+ "filter_buckets": stats_after['filter_buckets'],
603
+ "last_reset": stats_after['last_reset']
604
+ },
605
+ "note": "Filter is now ready for fresh ingestion. Next fetch will save all articles."
606
+ }
607
+
608
+ except Exception as e:
609
+ raise HTTPException(
610
+ status_code=500,
611
+ detail=f"Failed to reset Bloom Filter: {str(e)}"
612
+ )
613
+
614
+
615
+ @router.get("/bloom-filter/stats")
616
+ async def get_bloom_filter_stats():
617
+ """
618
+ Get Scalable Bloom Filter statistics - Observability Endpoint
619
+
620
+ Shows:
621
+ - Total URLs processed
622
+ - Duplicate detection rate
623
+ - Filter bucket count (auto-scaling metric)
624
+ - Memory usage estimate
625
+ - Last persistence time
626
+
627
+ **PRODUCTION DIAGNOSTIC**: Use this to verify filter state
628
+ and detect saturation issues.
629
+
630
+ Returns:
631
+ Comprehensive filter statistics
632
+ """
633
+ try:
634
+ from app.services.deduplication import get_url_filter
635
+
636
+ url_filter = get_url_filter()
637
+ stats = url_filter.get_stats()
638
+
639
+ # Calculate additional metrics
640
+ memory_usage = url_filter.get_estimated_memory_usage()
641
+
642
+ return {
643
+ "success": True,
644
+ "filter_type": "ScalableBloomFilter (pybloom_live)",
645
+ "persistence_enabled": True,
646
+ "persistence_path": url_filter.persistence_path,
647
+ "statistics": {
648
+ "total_checks": stats['total_checks'],
649
+ "unique_urls_added": stats['unique_urls_added'],
650
+ "duplicates_detected": stats['duplicates_detected'],
651
+ "duplicate_rate_percent": stats['duplicate_rate_percent'],
652
+ "filter_buckets": stats['filter_buckets'],
653
+ "initial_capacity": stats['initial_capacity'],
654
+ "current_estimated_capacity": stats['estimated_current_capacity'],
655
+ "error_rate": stats['filter_error_rate'],
656
+ "is_scalable": stats['is_scalable'],
657
+ "last_reset": stats['last_reset'],
658
+ "last_save": stats['last_save']
659
+ },
660
+ "memory": {
661
+ "estimated_usage": memory_usage,
662
+ "note": "Scalable Bloom Filter auto-grows as needed"
663
+ },
664
+ "health": {
665
+ "status": "healthy" if stats['duplicate_rate_percent'] < 95 else "warning",
666
+ "warning": "100% duplicate rate detected - consider reset" if stats['duplicate_rate_percent'] >= 99.5 else None
667
+ }
668
+ }
669
+
670
+ except Exception as e:
671
+ raise HTTPException(
672
+ status_code=500,
673
+ detail=f"Failed to get Bloom Filter stats: {str(e)}"
674
+ )
675
+
676
+
677
+ @router.get("/bloom-filter/health")
678
+ async def bloom_filter_health_check():
679
+ """
680
+ Quick health check for Bloom Filter - Production Monitoring
681
+
682
+ Returns:
683
+ - Status: healthy/warning/critical
684
+ - Reason for any issues
685
+ - Recommended action
686
+
687
+ **ALERTING**: Use this for automated monitoring.
688
+ A "critical" status means ingestion is likely broken.
689
+ """
690
+ try:
691
+ from app.services.deduplication import get_url_filter
692
+ from app.services.appwrite_db import get_appwrite_db
693
+ import os
694
+
695
+ url_filter = get_url_filter()
696
+ stats = url_filter.get_stats()
697
+ appwrite_db = get_appwrite_db()
698
+
699
+ # Check 1: Duplicate rate health
700
+ duplicate_rate = stats['duplicate_rate_percent']
701
+
702
+ # Check 2: Filter persistence file exists
703
+ filter_file_exists = os.path.exists(url_filter.persistence_path)
704
+
705
+ # Check 3: Database initialized
706
+ db_initialized = appwrite_db.initialized
707
+
708
+ # Determine health status
709
+ issues = []
710
+ status = "healthy"
711
+
712
+ if duplicate_rate >= 99.5:
713
+ status = "critical"
714
+ issues.append({
715
+ "type": "duplicate_saturation",
716
+ "severity": "critical",
717
+ "details": f"Duplicate rate: {duplicate_rate}% (expected < 95%)",
718
+ "action": "Reset Bloom Filter via POST /admin/bloom-filter/reset"
719
+ })
720
+ elif duplicate_rate >= 90:
721
+ status = "warning"
722
+ issues.append({
723
+ "type": "high_duplicates",
724
+ "severity": "warning",
725
+ "details": f"Duplicate rate: {duplicate_rate}% (expected < 90%)",
726
+ "action": "Monitor ingestion logs for validation issues"
727
+ })
728
+
729
+ if not filter_file_exists:
730
+ issues.append({
731
+ "type": "missing_persistence",
732
+ "severity": "info",
733
+ "details": "Filter persistence file not found (filter will create on first save)",
734
+ "action": "No action needed - this is normal on first run"
735
+ })
736
+
737
+ if not db_initialized:
738
+ status = "critical"
739
+ issues.append({
740
+ "type": "database_disconnected",
741
+ "severity": "critical",
742
+ "details": "Appwrite database not initialized",
743
+ "action": "Check Appwrite credentials in environment variables"
744
+ })
745
+
746
+ return {
747
+ "status": status,
748
+ "timestamp": stats['last_reset'],
749
+ "checks_performed": {
750
+ "duplicate_rate": duplicate_rate,
751
+ "filter_persisted": filter_file_exists,
752
+ "database_initialized": db_initialized
753
+ },
754
+ "issues": issues if issues else [],
755
+ "recommendation": issues[0]["action"] if issues else "System healthy - all checks passed"
756
+ }
757
+
758
+ except Exception as e:
759
+ return {
760
+ "status": "error",
761
+ "error": str(e),
762
+ "recommendation": "Check server logs for detailed error information"
763
+ }
764
+
app/services/appwrite_db.py CHANGED
@@ -88,28 +88,37 @@ class AppwriteDatabase:
88
 
89
  def _generate_url_hash(self, url: str) -> str:
90
  """
91
- Generate a unique hash for an article URL (with canonicalization)
92
 
93
- Uses canonical URL normalization to catch duplicate stories:
94
- - https://cnn.com/story?utm_source=twitter
95
- - https://www.cnn.com/story?ref=homepage
96
- Both map to same hash!
 
 
 
 
 
 
 
97
 
98
  Args:
99
- url: Article URL
100
 
101
  Returns:
102
- 16-character hex hash
 
 
 
 
103
  """
104
- from app.utils.url_canonicalization import canonicalize_url
105
  import hashlib
106
 
107
- # Canonicalize URL first for better deduplication
108
- canonical_url = canonicalize_url(url)
109
 
110
- # Generate hash from canonical URL
111
- hash_bytes = hashlib.sha256(canonical_url.encode('utf-8')).hexdigest()
112
- return hash_bytes[:16] # First 16 characters
113
 
114
  async def get_articles(self, category: str, limit: int = 20, offset: int = 0) -> List[Dict]:
115
  """
 
88
 
89
  def _generate_url_hash(self, url: str) -> str:
90
  """
91
+ Generate a unique hash for an article URL
92
 
93
+ **INTEGRATION FIX #2**: Updated to match frontend ID generation
94
+
95
+ Uses SHA-256 hash of the RAW URL (not canonicalized) to ensure
96
+ Frontend and Backend generate IDENTICAL IDs for the same article.
97
+
98
+ **WHY THE CHANGE**:
99
+ - OLD: 16-char hash of canonical URL (different from frontend)
100
+ - NEW: 32-char hash of raw URL (matches frontend exactly)
101
+
102
+ **NOTE**: Canonicalization is still used for Appwrite deduplication
103
+ via the unique constraint, but NOT for ID generation.
104
 
105
  Args:
106
+ url: Article URL (raw, not canonicalized)
107
 
108
  Returns:
109
+ 32-character hex hash (Appwrite-compatible, frontend-compatible)
110
+
111
+ Example:
112
+ >>> _generate_url_hash("https://cnn.com/article?utm=123")
113
+ "a1b2c3d4e5f67890abcdef1234567890" # 32 chars
114
  """
 
115
  import hashlib
116
 
117
+ # Generate SHA-256 hash from RAW URL (no canonicalization for ID)
118
+ hash_bytes = hashlib.sha256(url.encode('utf-8')).hexdigest()
119
 
120
+ # Return first 32 characters (matches frontend idGenerator.ts)
121
+ return hash_bytes[:32]
 
122
 
123
  async def get_articles(self, category: str, limit: int = 20, offset: int = 0) -> List[Dict]:
124
  """
app/utils/data_validation.py CHANGED
@@ -278,8 +278,9 @@ def is_relevant_to_category(article: Union[Dict, 'Article'], category: str) -> b
278
  return True
279
 
280
  # Combine title and description for checking
281
- title = article_dict.get('title', '').lower()
282
- description = article_dict.get('description', '').lower()
 
283
  text = f"{title} {description}"
284
 
285
  # Count keyword matches
 
278
  return True
279
 
280
  # Combine title and description for checking
281
+ # FIX: Use (value or '') pattern to handle explicit None values from messy RSS feeds
282
+ title = (article_dict.get('title') or '').lower()
283
+ description = (article_dict.get('description') or '').lower()
284
  text = f"{title} {description}"
285
 
286
  # Count keyword matches
app/utils/id_generator.py CHANGED
@@ -17,6 +17,8 @@ def generate_article_id(url: str) -> str:
17
  """
18
  Generate Appwrite-compatible ID from URL
19
 
 
 
20
  Uses SHA-256 hash truncated to 32 characters to ensure
21
  it fits within Appwrite's 36-character document ID limit.
22
 
 
17
  """
18
  Generate Appwrite-compatible ID from URL
19
 
20
+ **INTEGRATION FIX #2**: Matches frontend and appwrite_db.py
21
+
22
  Uses SHA-256 hash truncated to 32 characters to ensure
23
  it fits within Appwrite's 36-character document ID limit.
24
 
verify_integration_fixes.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Integration Fix Verification Script
3
+ ====================================
4
+
5
+ Tests all 3 integration fixes to ensure they're working correctly.
6
+
7
+ Run this script to verify:
8
+ 1. Bloom Filter reset functionality
9
+ 2. ID generation consistency (32-char)
10
+ 3. Validation null-safety
11
+
12
+ Usage:
13
+ python verify_integration_fixes.py
14
+ """
15
+
16
+ import asyncio
17
+ import hashlib
18
+
19
+
20
+ def test_id_generation():
21
+ """
22
+ Test #2: Verify ID generation consistency
23
+
24
+ Frontend and Backend MUST generate identical IDs for the same URL
25
+ """
26
+ print("=" * 70)
27
+ print("πŸ” TEST 1: ID Generation Consistency")
28
+ print("=" * 70)
29
+
30
+ test_urls = [
31
+ "https://cnn.com/article?utm_source=twitter",
32
+ "https://techcrunch.com/2024/ai-news",
33
+ "https://example.com/test"
34
+ ]
35
+
36
+ # Import backend ID generators
37
+ from app.services.appwrite_db import AppwriteDatabase
38
+ from app.utils.id_generator import generate_article_id as backend_gen
39
+
40
+ db = AppwriteDatabase()
41
+
42
+ all_match = True
43
+
44
+ for url in test_urls:
45
+ # Backend Method 1 (appwrite_db)
46
+ id_appwrite = db._generate_url_hash(url)
47
+
48
+ # Backend Method 2 (id_generator)
49
+ id_backend = backend_gen(url)
50
+
51
+ # Simulate frontend (same algo)
52
+ hash_obj = hashlib.sha256(url.encode('utf-8'))
53
+ id_frontend = hash_obj.hexdigest()[:32]
54
+
55
+ print(f"\nURL: {url[:50]}...")
56
+ print(f" Frontend: {id_frontend} (len={len(id_frontend)})")
57
+ print(f" Backend1: {id_appwrite} (len={len(id_appwrite)})")
58
+ print(f" Backend2: {id_backend} (len={len(id_backend)})")
59
+
60
+ # Verify all match
61
+ if id_frontend == id_appwrite == id_backend and len(id_frontend) == 32:
62
+ print(" βœ… PASS: All IDs match and are 32 chars")
63
+ else:
64
+ print(" ❌ FAIL: ID mismatch detected!")
65
+ all_match = False
66
+
67
+ print("\n" + "=" * 70)
68
+ if all_match:
69
+ print("βœ… TEST 1 PASSED: ID generation is consistent across all components")
70
+ else:
71
+ print("❌ TEST 1 FAILED: ID generation mismatch detected")
72
+ print("=" * 70)
73
+ print()
74
+
75
+ return all_match
76
+
77
+
78
+ def test_validation_null_safety():
79
+ """
80
+ Test #3: Verify null-safety in validation
81
+
82
+ Should handle articles with None fields gracefully
83
+ """
84
+ print("=" * 70)
85
+ print("πŸ” TEST 2: Validation Null-Safety")
86
+ print("=" * 70)
87
+
88
+ from app.utils.data_validation import is_relevant_to_category
89
+
90
+ # Test articles with None fields (production edge case)
91
+ test_articles = [
92
+ {
93
+ "title": "AI News Update",
94
+ "description": None, # Explicit None
95
+ "url": "https://test.com/1"
96
+ },
97
+ {
98
+ "title": None, # Explicit None
99
+ "description": "Some content",
100
+ "url": "https://test.com/2"
101
+ },
102
+ {
103
+ "title": "Cloud Computing",
104
+ "description": "", # Empty string
105
+ "url": "https://test.com/3"
106
+ }
107
+ ]
108
+
109
+ all_pass = True
110
+
111
+ for i, article in enumerate(test_articles, 1):
112
+ try:
113
+ result = is_relevant_to_category(article, "ai")
114
+ print(f"\n Article {i}: βœ… No crash (relevant={result})")
115
+ print(f" Title: {article.get('title')}")
116
+ print(f" Description: {article.get('description')}")
117
+ except AttributeError as e:
118
+ print(f"\n Article {i}: ❌ CRASH - {e}")
119
+ all_pass = False
120
+ except Exception as e:
121
+ print(f"\n Article {i}: ⚠️ Other error - {e}")
122
+
123
+ print("\n" + "=" * 70)
124
+ if all_pass:
125
+ print("βœ… TEST 2 PASSED: Validation handles None fields gracefully")
126
+ else:
127
+ print("❌ TEST 2 FAILED: Validation crashes on None fields")
128
+ print("=" * 70)
129
+ print()
130
+
131
+ return all_pass
132
+
133
+
134
+ async def test_bloom_filter_reset():
135
+ """
136
+ Test #1: Verify Bloom Filter reset functionality
137
+
138
+ Should successfully reset and show before/after stats
139
+ """
140
+ print("=" * 70)
141
+ print("πŸ” TEST 3: Bloom Filter Reset")
142
+ print("=" * 70)
143
+
144
+ from app.services.deduplication import get_url_filter
145
+
146
+ try:
147
+ # Get filter instance
148
+ url_filter = get_url_filter()
149
+
150
+ # Add some test URLs
151
+ test_urls = [
152
+ "https://test1.com/article",
153
+ "https://test2.com/news",
154
+ "https://test3.com/update"
155
+ ]
156
+
157
+ print("\nAdding test URLs to filter...")
158
+ for url in test_urls:
159
+ url_filter.check_and_add(url)
160
+
161
+ # Get stats before reset
162
+ stats_before = url_filter.get_stats()
163
+ print(f"\nπŸ“Š Before Reset:")
164
+ print(f" Total Checks: {stats_before['total_checks']}")
165
+ print(f" Unique URLs: {stats_before['unique_urls_added']}")
166
+ print(f" Duplicates: {stats_before['duplicates_detected']}")
167
+
168
+ # Reset the filter
169
+ print("\nπŸ”„ Resetting filter...")
170
+ url_filter.reset()
171
+
172
+ # Get stats after reset
173
+ stats_after = url_filter.get_stats()
174
+ print(f"\nπŸ“Š After Reset:")
175
+ print(f" Total Checks: {stats_after['total_checks']}")
176
+ print(f" Unique URLs: {stats_after['unique_urls_added']}")
177
+ print(f" Duplicates: {stats_after['duplicates_detected']}")
178
+
179
+ # Verify reset worked
180
+ if stats_after['unique_urls_added'] == 0:
181
+ print("\nβœ… Filter reset successfully (all counters zeroed)")
182
+ success = True
183
+ else:
184
+ print("\n❌ Filter reset failed (counters not zeroed)")
185
+ success = False
186
+
187
+ except Exception as e:
188
+ print(f"\n❌ Test failed with error: {e}")
189
+ import traceback
190
+ traceback.print_exc()
191
+ success = False
192
+
193
+ print("\n" + "=" * 70)
194
+ if success:
195
+ print("βœ… TEST 3 PASSED: Bloom Filter reset works correctly")
196
+ else:
197
+ print("❌ TEST 3 FAILED: Bloom Filter reset has issues")
198
+ print("=" * 70)
199
+ print()
200
+
201
+ return success
202
+
203
+
204
+ async def run_all_tests():
205
+ """Run all verification tests"""
206
+ print("\n" + "πŸš€" * 35)
207
+ print("INTEGRATION FIX VERIFICATION SUITE")
208
+ print("πŸš€" * 35 + "\n")
209
+
210
+ results = {}
211
+
212
+ # Test 1: ID Generation
213
+ results['id_generation'] = test_id_generation()
214
+
215
+ # Test 2: Validation
216
+ results['validation'] = test_validation_null_safety()
217
+
218
+ # Test 3: Bloom Filter
219
+ results['bloom_filter'] = await test_bloom_filter_reset()
220
+
221
+ # Summary
222
+ print("\n" + "=" * 70)
223
+ print("πŸ“‹ FINAL SUMMARY")
224
+ print("=" * 70)
225
+
226
+ all_passed = all(results.values())
227
+
228
+ for test_name, passed in results.items():
229
+ status = "βœ… PASS" if passed else "❌ FAIL"
230
+ print(f"{status} {test_name.replace('_', ' ').title()}")
231
+
232
+ print("=" * 70)
233
+
234
+ if all_passed:
235
+ print("\nπŸŽ‰ ALL TESTS PASSED - Integration fixes working correctly!")
236
+ print("βœ… System is ready for production deployment")
237
+ else:
238
+ print("\n⚠️ SOME TESTS FAILED - Review errors above")
239
+ print("❌ Fix issues before deploying to production")
240
+
241
+ print("\n")
242
+ return all_passed
243
+
244
+
245
+ if __name__ == "__main__":
246
+ # Run tests
247
+ success = asyncio.run(run_all_tests())
248
+
249
+ # Exit code for CI/CD
250
+ exit(0 if success else 1)