SHAFI committed on
Commit
3690599
·
1 Parent(s): 3d33162

added migration for likes, views, dislikes, separate collection for cloud news, route news to new collection

Browse files
app/config.py CHANGED
@@ -50,12 +50,13 @@ class Settings(BaseSettings):
50
  # Frontend URL (for unsubscribe links)
51
  FRONTEND_URL: str = "https://segmento.in"
52
 
53
- # Appwrite Database (Phase 2)
54
  APPWRITE_ENDPOINT: str = "https://nyc.cloud.appwrite.io/v1"
55
  APPWRITE_PROJECT_ID: str = ""
56
  APPWRITE_API_KEY: str = ""
57
  APPWRITE_DATABASE_ID: str = "segmento_db"
58
- APPWRITE_COLLECTION_ID: str = "articles"
 
59
 
60
  # Admin Alerting (Optional - Discord/Slack webhook URL)
61
  ADMIN_WEBHOOK_URL: Optional[str] = None
 
50
  # Frontend URL (for unsubscribe links)
51
  FRONTEND_URL: str = "https://segmento.in"
52
 
53
+ # Appwrite Database
54
  APPWRITE_ENDPOINT: str = "https://nyc.cloud.appwrite.io/v1"
55
  APPWRITE_PROJECT_ID: str = ""
56
  APPWRITE_API_KEY: str = ""
57
  APPWRITE_DATABASE_ID: str = "segmento_db"
58
+ APPWRITE_COLLECTION_ID: str = "articles" # Regular articles
59
+ APPWRITE_CLOUD_COLLECTION_ID: str = "" # Phase 3: Cloud news (to be created)
60
 
61
  # Admin Alerting (Optional - Discord/Slack webhook URL)
62
  ADMIN_WEBHOOK_URL: Optional[str] = None
app/main.py CHANGED
@@ -70,6 +70,10 @@ app.include_router(analytics.router, prefix="/api/analytics", tags=["Analytics"]
70
  app.include_router(subscription.router, tags=["Subscription"])
71
  app.include_router(admin.router, prefix="/api/admin", tags=["Admin"])
72
 
 
 
 
 
73
  @app.get("/")
74
  async def root():
75
  """Root endpoint"""
 
70
  app.include_router(subscription.router, tags=["Subscription"])
71
  app.include_router(admin.router, prefix="/api/admin", tags=["Admin"])
72
 
73
+ # Phase 3: Engagement tracking
74
+ from app.routes import engagement
75
+ app.include_router(engagement.router, prefix="/api/engagement", tags=["Engagement"])
76
+
77
  @app.get("/")
78
  async def root():
79
  """Root endpoint"""
app/routes/engagement.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Engagement API Endpoints
3
+ Handles article likes, views tracking, and trending articles
4
+ """
5
+
6
+ from fastapi import APIRouter, HTTPException, Depends
7
+ from typing import Optional
8
+ from app.services.appwrite_db import get_appwrite_db
9
+ from app.config import settings
10
+ from datetime import datetime, timedelta
11
+ import logging
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ router = APIRouter()
16
+
17
+
18
@router.post("/articles/{article_id}/like")
async def like_article(article_id: str):
    """
    Increment the like count for an article.

    Phase 3: engagement tracking for article popularity. Looks the article
    up in the regular articles collection first, then falls back to the
    cloud collection (if configured).

    Args:
        article_id: Appwrite document ID.

    Returns:
        Dict with the article_id, the updated likes count, and a success flag.

    Raises:
        HTTPException: 404 if the article exists in neither collection,
            500 on unexpected Appwrite errors.
    """
    try:
        appwrite_db = get_appwrite_db()

        # Try the regular articles collection first.
        collection_id = settings.APPWRITE_COLLECTION_ID

        try:
            doc = appwrite_db.databases.get_document(
                database_id=settings.APPWRITE_DATABASE_ID,
                collection_id=collection_id,
                document_id=article_id
            )
        # FIX: was a bare `except:`, which also swallows SystemExit /
        # KeyboardInterrupt; catch Exception and fall back instead.
        except Exception:
            if not settings.APPWRITE_CLOUD_COLLECTION_ID:
                raise HTTPException(status_code=404, detail="Article not found")
            collection_id = settings.APPWRITE_CLOUD_COLLECTION_ID
            try:
                doc = appwrite_db.databases.get_document(
                    database_id=settings.APPWRITE_DATABASE_ID,
                    collection_id=collection_id,
                    document_id=article_id
                )
            # FIX: an article missing from both collections is a 404,
            # not a 500 from the Appwrite lookup error.
            except Exception:
                raise HTTPException(status_code=404, detail="Article not found")

        # Increment likes.
        # NOTE(review): read-modify-write is not atomic; concurrent likes
        # can be lost. Acceptable for approximate engagement counters.
        new_likes = doc.get('likes', 0) + 1

        # Persist the updated counter.
        appwrite_db.databases.update_document(
            database_id=settings.APPWRITE_DATABASE_ID,
            collection_id=collection_id,
            document_id=article_id,
            data={"likes": new_likes}
        )

        logger.info(f"❤️ Article {article_id[:8]}... liked (total: {new_likes})")

        return {
            "article_id": article_id,
            "likes": new_likes,
            "success": True
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error liking article {article_id}: {e}")
        raise HTTPException(status_code=500, detail=str(e))
80
+
81
+
82
@router.post("/articles/{article_id}/dislike")
async def dislike_article(article_id: str):
    """
    Increment the dislike count for an article.

    Phase 3: engagement tracking for article feedback. Looks the article
    up in the regular articles collection first, then falls back to the
    cloud collection (if configured).

    Args:
        article_id: Appwrite document ID.

    Returns:
        Dict with the article_id, the updated dislikes count, and a success flag.

    Raises:
        HTTPException: 404 if the article exists in neither collection,
            500 on unexpected Appwrite errors.
    """
    try:
        appwrite_db = get_appwrite_db()

        # Try the regular articles collection first.
        collection_id = settings.APPWRITE_COLLECTION_ID

        try:
            doc = appwrite_db.databases.get_document(
                database_id=settings.APPWRITE_DATABASE_ID,
                collection_id=collection_id,
                document_id=article_id
            )
        # FIX: was a bare `except:`, which also swallows SystemExit /
        # KeyboardInterrupt; catch Exception and fall back instead.
        except Exception:
            if not settings.APPWRITE_CLOUD_COLLECTION_ID:
                raise HTTPException(status_code=404, detail="Article not found")
            collection_id = settings.APPWRITE_CLOUD_COLLECTION_ID
            try:
                doc = appwrite_db.databases.get_document(
                    database_id=settings.APPWRITE_DATABASE_ID,
                    collection_id=collection_id,
                    document_id=article_id
                )
            # FIX: an article missing from both collections is a 404,
            # not a 500 from the Appwrite lookup error.
            except Exception:
                raise HTTPException(status_code=404, detail="Article not found")

        # Increment dislikes.
        # NOTE(review): read-modify-write is not atomic; concurrent updates
        # can be lost. Acceptable for approximate engagement counters.
        new_dislikes = doc.get('dislikes', 0) + 1

        # Persist the updated counter.
        appwrite_db.databases.update_document(
            database_id=settings.APPWRITE_DATABASE_ID,
            collection_id=collection_id,
            document_id=article_id,
            data={"dislikes": new_dislikes}
        )

        logger.info(f"👎 Article {article_id[:8]}... disliked (total: {new_dislikes})")

        return {
            "article_id": article_id,
            "dislikes": new_dislikes,
            "success": True
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error disliking article {article_id}: {e}")
        raise HTTPException(status_code=500, detail=str(e))
144
+
145
+
146
@router.post("/articles/{article_id}/view")
async def track_view(article_id: str):
    """
    Increment the view count for an article.

    Phase 3: track article views for analytics. Looks the article up in
    the regular articles collection first, then falls back to the cloud
    collection (if configured).

    Args:
        article_id: Appwrite document ID.

    Returns:
        Dict with the article_id, the updated views count, and a success flag.

    Raises:
        HTTPException: 404 if the article exists in neither collection,
            500 on unexpected Appwrite errors.
    """
    try:
        appwrite_db = get_appwrite_db()

        # Try the regular articles collection first.
        collection_id = settings.APPWRITE_COLLECTION_ID

        try:
            doc = appwrite_db.databases.get_document(
                database_id=settings.APPWRITE_DATABASE_ID,
                collection_id=collection_id,
                document_id=article_id
            )
        # FIX: was a bare `except:`, which also swallows SystemExit /
        # KeyboardInterrupt; catch Exception and fall back instead.
        except Exception:
            if not settings.APPWRITE_CLOUD_COLLECTION_ID:
                raise HTTPException(status_code=404, detail="Article not found")
            collection_id = settings.APPWRITE_CLOUD_COLLECTION_ID
            try:
                doc = appwrite_db.databases.get_document(
                    database_id=settings.APPWRITE_DATABASE_ID,
                    collection_id=collection_id,
                    document_id=article_id
                )
            # FIX: an article missing from both collections is a 404,
            # not a 500 from the Appwrite lookup error.
            except Exception:
                raise HTTPException(status_code=404, detail="Article not found")

        # Increment views.
        # NOTE(review): read-modify-write is not atomic; concurrent views
        # can be lost. Acceptable for approximate engagement counters.
        new_views = doc.get('views', 0) + 1

        # Persist the updated counter.
        appwrite_db.databases.update_document(
            database_id=settings.APPWRITE_DATABASE_ID,
            collection_id=collection_id,
            document_id=article_id,
            data={"views": new_views}
        )

        # Log only every 10th view to avoid log spam.
        if new_views % 10 == 0:
            logger.info(f"👁️ Article {article_id[:8]}... reached {new_views} views")

        return {
            "article_id": article_id,
            "views": new_views,
            "success": True
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error tracking view for {article_id}: {e}")
        raise HTTPException(status_code=500, detail=str(e))
210
+
211
+
212
@router.get("/articles/trending")
async def get_trending_articles(
    hours: int = 24,
    limit: int = 10,
    cloud_only: bool = False
):
    """
    Get trending articles based on views and likes.

    Phase 3: discover popular content.

    Args:
        hours: Time window for trending (default: 24 hours).
        limit: Number of articles to return (default: 10).
        cloud_only: Only return cloud articles (default: False).

    Returns:
        List of trending articles sorted by engagement score.

    Raises:
        HTTPException: 500 on unexpected Appwrite errors.
    """
    try:
        from appwrite.query import Query

        appwrite_db = get_appwrite_db()
        # Naive local time, matching how ingestion stamps published_at.
        cutoff = (datetime.now() - timedelta(hours=hours)).isoformat()

        # Use the cloud collection only when requested AND configured.
        if cloud_only and settings.APPWRITE_CLOUD_COLLECTION_ID:
            collection_id = settings.APPWRITE_CLOUD_COLLECTION_ID
        else:
            collection_id = settings.APPWRITE_COLLECTION_ID

        # Query recent articles sorted by views (descending).
        # FIX: the attribute is stored as 'published_at' (snake_case) by
        # ingestion and queried as 'published_at' by the cleanup janitor;
        # the previous 'publishedAt' filter matched nothing.
        response = appwrite_db.databases.list_documents(
            database_id=settings.APPWRITE_DATABASE_ID,
            collection_id=collection_id,
            queries=[
                Query.greater_than('published_at', cutoff),
                Query.order_desc('views'),
                Query.limit(limit)
            ]
        )

        articles = response['documents']

        # Engagement score = views + likes*5 - dislikes*3: likes weighted
        # higher, dislikes penalized.
        # NOTE(review): only the top-`limit` by views is re-ranked; an
        # article with many likes but few views may never enter the window.
        for article in articles:
            article['engagement_score'] = (
                article.get('views', 0)
                + article.get('likes', 0) * 5
                - article.get('dislikes', 0) * 3
            )

        # Re-sort the window by engagement score.
        articles.sort(key=lambda a: a.get('engagement_score', 0), reverse=True)

        logger.info(f"🔥 Trending: {len(articles)} articles in last {hours}h")

        return {
            "articles": articles[:limit],
            "timeframe_hours": hours,
            "cloud_only": cloud_only,
            "total_count": len(articles)
        }

    except Exception as e:
        logger.error(f"Error getting trending articles: {e}")
        raise HTTPException(status_code=500, detail=str(e))
279
+
280
+
281
@router.get("/articles/popular-cloud")
async def get_popular_cloud_articles(provider: Optional[str] = None, limit: int = 10):
    """
    Get popular cloud articles, optionally filtered by provider.

    Phase 3: cloud-specific trending.

    Args:
        provider: Cloud provider (aws, azure, gcp, ...) or None for all.
        limit: Number of articles to return (default: 10).

    Returns:
        Popular cloud articles sorted by views.

    Raises:
        HTTPException: 404 if the cloud collection is not configured,
            500 on unexpected Appwrite errors.
    """
    try:
        from appwrite.query import Query

        # The cloud collection is optional; bail out early if unset.
        if not settings.APPWRITE_CLOUD_COLLECTION_ID:
            raise HTTPException(status_code=404, detail="Cloud collection not configured")

        db_client = get_appwrite_db()

        # Assemble the query list: optional provider filter first, then
        # ordering and limit.
        query_filters = []
        if provider:
            query_filters.append(Query.equal('provider', provider))
        query_filters.append(Query.order_desc('views'))
        query_filters.append(Query.limit(limit))

        result = db_client.databases.list_documents(
            database_id=settings.APPWRITE_DATABASE_ID,
            collection_id=settings.APPWRITE_CLOUD_COLLECTION_ID,
            queries=query_filters
        )

        matched = result['documents']

        logger.info(f"☁️ Popular cloud articles: {len(matched)} (provider={provider or 'all'})")

        return {
            "articles": matched,
            "provider": provider,
            "total_count": len(matched)
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error getting popular cloud articles: {e}")
        raise HTTPException(status_code=500, detail=str(e))
app/services/ingestion_v2.py CHANGED
@@ -34,8 +34,80 @@ logger = get_professional_logger(__name__)
34
  # ============================================================================
35
  # Space B Configuration
36
  # ============================================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
- SPACE_B_URL = "https://workwithshafisk-segmentopulse-factory.hf.space" # Note: Update this with your actual Space B URL
39
  SPACE_B_TIMEOUT = 30 # seconds (Llama-3 is slow on CPU)
40
 
41
 
@@ -313,26 +385,37 @@ async def fetch_latest_news(categories: List[str]) -> Dict[str, List[Article]]:
313
  # Article Processing with Space B + ChromaDB
314
  # ============================================================================
315
 
316
- async def process_and_store_article(url: str, raw_text: str, category: str, title: str = "") -> Optional[Dict]:
 
 
 
 
 
 
317
  """
318
- Phase 2 CQRS: Offload processing to Space B, then store in ChromaDB
319
 
320
  Architecture:
321
  1. Send raw_text to Space B's /process-article endpoint
322
  2. Receive summary + tags from Space B
323
- 3. Generate embeddings locally using sentence-transformers
324
- 4. Store in ChromaDB
 
 
325
 
326
  Args:
327
  url: Article URL (used as ID)
328
  raw_text: Full article content
329
  category: Article category
330
  title: Article title (optional)
 
331
 
332
  Returns:
333
  Dictionary with processing results or None on error
334
  """
335
  try:
 
 
336
  logger.space_b_call(url, "started")
337
 
338
  # -------------------------------------------------------------------------
@@ -377,57 +460,84 @@ async def process_and_store_article(url: str, raw_text: str, category: str, titl
377
  return None
378
 
379
  # -------------------------------------------------------------------------
380
- # Step 2: Generate embeddings locally with sentence-transformers
381
  # -------------------------------------------------------------------------
382
- # ChromaDB vector_store has embedded model (all-MiniLM-L6-v2)
383
- # We'll use the existing upsert_article method
 
 
 
384
 
385
  # -------------------------------------------------------------------------
386
- # Step 3: Prepare article data for ChromaDB
 
 
 
 
 
 
387
  # -------------------------------------------------------------------------
388
  url_hash = hashlib.md5(url.encode()).hexdigest()
389
 
 
 
 
390
  article_data = {
391
  "$id": url_hash,
392
- "title": title or summary[:100], # Use title if available, else first part of summary
393
- "description": summary,
 
 
394
  "url": url,
395
  "source": "Segmento AI",
396
  "category": category,
397
  "published_at": datetime.now().isoformat(),
398
  "image": "", # No image for now
399
- "tags": tags
 
 
 
 
 
 
 
 
 
 
 
 
400
  }
401
 
402
  # -------------------------------------------------------------------------
403
- # Step 4: Store in ChromaDB
404
  # -------------------------------------------------------------------------
405
- # Create combined text: Title + Summary + Tags (for richer embeddings)
406
  tags_text = " ".join(tags) if tags else ""
407
- combined_analysis = f"Summary: {summary}\nTags: {tags_text}"
408
 
409
  # Upsert to vector store (handles embedding generation internally)
410
  vector_store.upsert_article(article_data, combined_analysis)
411
  ingestion_stats.chromadb_upserts += 1
412
  ingestion_stats.articles_saved += 1
413
 
414
- logger.success(f"ChromaDB stored: {title[:50] if title else url[:50]}")
 
415
 
416
  return {
417
  "url": url,
418
- "summary": summary,
419
  "tags": tags,
 
 
420
  "stored": True
421
  }
422
 
423
  except Exception as e:
424
- logger.error(f"[CQRS] Processing failed for {url}: {e}")
425
  return None
426
 
427
 
428
  async def fetch_single_category(category: str) -> List[Article]:
429
- """
430
- Convenience function to fetch a single category
431
 
432
  Args:
433
  category: Category name
 
34
  # ============================================================================
35
  # Space B Configuration
36
  # ============================================================================
37
# Constants
SPACE_B_URL = "https://workwithshafisk-segmentopulse-factory.hf.space"

# Phase 3: Cloud News Categories
CLOUD_CATEGORIES = [
    "cloud-aws",
    "cloud-azure",
    "cloud-gcp",
    "cloud-oracle",
    "cloud-ibm",
    "cloud-alibaba",
    "cloud-digitalocean",
    "cloud-huawei",
    "cloud-cloudflare",
    "cloud-computing"  # General cloud news
]

# Phase 3: Official Cloud Provider Feeds
# Maps feed URL -> (provider_name, is_official).
# NOTE(review): these URLs should match the feed map in
# news_providers.py exactly, or official-feed detection by URL will
# silently miss — verify they stay in sync.
OFFICIAL_CLOUD_FEEDS = {
    "https://aws.amazon.com/blogs/aws/feed/": ("aws", True),
    "https://azure.microsoft.com/en-us/blog/feed/": ("azure", True),
    "https://cloudblog.withgoogle.com/rss/": ("gcp", True),
    "https://blogs.oracle.com/cloud-infrastructure/rss": ("oracle", True),
    "https://www.ibm.com/blog/category/ibm-cloud/feed/": ("ibm", True),
    "https://www.alibabacloud.com/blog/rss.xml": ("alibaba", True),
    "https://www.digitalocean.com/blog/rss.xml": ("digitalocean", True),
    "https://developer.huaweicloud.com/intl/en-us/feed": ("huawei", True),
    "https://blog.cloudflare.com/rss/": ("cloudflare", True)
}


def determine_cloud_provider(category: str, source_feed: str) -> tuple:
    """
    Phase 3: Determine the cloud provider and whether the article came
    from an official provider blog.

    Args:
        category: News category (e.g., "cloud-aws").
        source_feed: RSS feed URL the article was fetched from.

    Returns:
        Tuple of (provider_name, is_official).

    Examples:
        ("aws", True)    - feed URL is a known official blog
        ("azure", False) - aggregated news about Azure
    """
    # Known official feed: the map carries both provider and flag.
    official = OFFICIAL_CLOUD_FEEDS.get(source_feed)
    if official is not None:
        return official

    # Aggregated news: derive the provider from the category name.
    if category.startswith('cloud-'):
        return (category.replace('cloud-', ''), False)

    return ("general", False)


def route_to_collection(category: str, config_obj) -> str:
    """
    Phase 3: Pick the Appwrite collection for an article.

    Args:
        category: Article category.
        config_obj: Settings object exposing the collection IDs.

    Returns:
        Collection ID string (cloud collection for cloud categories when
        configured, otherwise the regular articles collection).
    """
    use_cloud = category in CLOUD_CATEGORIES and bool(config_obj.APPWRITE_CLOUD_COLLECTION_ID)
    return config_obj.APPWRITE_CLOUD_COLLECTION_ID if use_cloud else config_obj.APPWRITE_COLLECTION_ID
110
 
 
111
  SPACE_B_TIMEOUT = 30 # seconds (Llama-3 is slow on CPU)
112
 
113
 
 
385
  # Article Processing with Space B + ChromaDB
386
  # ============================================================================
387
 
388
+ async def process_and_store_article(
389
+ url: str,
390
+ raw_text: str,
391
+ category: str,
392
+ title: str = "",
393
+ source_feed: str = ""
394
+ ) -> Optional[Dict]:
395
  """
396
+ Phase 3: Enhanced processing with cloud detection and engagement metrics
397
 
398
  Architecture:
399
  1. Send raw_text to Space B's /process-article endpoint
400
  2. Receive summary + tags from Space B
401
+ 3. Detect cloud provider and routing
402
+ 4. Add engagement metrics (likes, views)
403
+ 5. Generate embeddings locally using sentence-transformers
404
+ 6. Store in ChromaDB with rich metadata
405
 
406
  Args:
407
  url: Article URL (used as ID)
408
  raw_text: Full article content
409
  category: Article category
410
  title: Article title (optional)
411
+ source_feed: RSS feed URL (for cloud detection)
412
 
413
  Returns:
414
  Dictionary with processing results or None on error
415
  """
416
  try:
417
+ from app.utils import strip_html_if_needed, list_to_comma_separated
418
+
419
  logger.space_b_call(url, "started")
420
 
421
  # -------------------------------------------------------------------------
 
460
  return None
461
 
462
  # -------------------------------------------------------------------------
463
+ # Step 2: Phase 3 - Cloud Detection
464
  # -------------------------------------------------------------------------
465
+ is_cloud = category in CLOUD_CATEGORIES
466
+ provider, is_official = determine_cloud_provider(category, source_feed)
467
+
468
+ if is_cloud:
469
+ logger.info(f"☁️ Cloud article detected: {provider} (official={is_official})")
470
 
471
  # -------------------------------------------------------------------------
472
+ # Step 3: Phase 3 - HTML Stripping & Text Cleaning
473
+ # -------------------------------------------------------------------------
474
+ title_clean = strip_html_if_needed(title) if title else summary[:100]
475
+ summary_clean = strip_html_if_needed(summary)
476
+
477
+ # -------------------------------------------------------------------------
478
+ # Step 4: Prepare article data for ChromaDB with Phase 3 metadata
479
  # -------------------------------------------------------------------------
480
  url_hash = hashlib.md5(url.encode()).hexdigest()
481
 
482
+ # Convert tags list to comma-separated string
483
+ tags_str = list_to_comma_separated(tags)
484
+
485
  article_data = {
486
  "$id": url_hash,
487
+
488
+ # Core content (cleaned)
489
+ "title": title_clean,
490
+ "description": summary_clean,
491
  "url": url,
492
  "source": "Segmento AI",
493
  "category": category,
494
  "published_at": datetime.now().isoformat(),
495
  "image": "", # No image for now
496
+
497
+ # Phase 3: Tags from GLiNER
498
+ "tags": tags_str,
499
+
500
+ # Phase 3: Cloud detection
501
+ "is_cloud_news": is_cloud,
502
+ "cloud_provider": provider if is_cloud else "",
503
+ "is_official": is_official if is_cloud else False,
504
+
505
+ # Phase 3: Engagement metrics
506
+ "likes": 0,
507
+ "dislikes": 0,
508
+ "views": 0
509
  }
510
 
511
  # -------------------------------------------------------------------------
512
+ # Step 5: Store in ChromaDB with Phase 3 enhanced schema
513
  # -------------------------------------------------------------------------
514
+ # Create combined text for embedding: Title + Summary + Tags
515
  tags_text = " ".join(tags) if tags else ""
516
+ combined_analysis = f"Summary: {summary_clean}\nTags: {tags_text}"
517
 
518
  # Upsert to vector store (handles embedding generation internally)
519
  vector_store.upsert_article(article_data, combined_analysis)
520
  ingestion_stats.chromadb_upserts += 1
521
  ingestion_stats.articles_saved += 1
522
 
523
+ cloud_emoji = "☁️" if is_cloud else "📰"
524
+ logger.success(f"{cloud_emoji} ChromaDB stored: {title_clean[:50]}")
525
 
526
  return {
527
  "url": url,
528
+ "summary": summary_clean,
529
  "tags": tags,
530
+ "is_cloud": is_cloud,
531
+ "provider": provider if is_cloud else None,
532
  "stored": True
533
  }
534
 
535
  except Exception as e:
536
+ logger.error(f"[Phase 3 CQRS] Processing failed for {url}: {e}")
537
  return None
538
 
539
 
540
  async def fetch_single_category(category: str) -> List[Article]:
 
 
541
 
542
  Args:
543
  category: Category name
app/services/news_providers.py CHANGED
@@ -494,7 +494,7 @@ class OfficialCloudProvider(NewsProvider):
494
  'cloud-alibaba': 'https://www.alibabacloud.com/blog/feed',
495
  'cloud-digitalocean': 'https://www.digitalocean.com/blog/rss.xml',
496
  'cloud-cloudflare': 'https://blog.cloudflare.com/rss/',
497
- 'cloud-huawei': 'https://blog.huawei.com/feed/', # Generic Huawei blog often used
498
  }
499
 
500
  async def fetch_news(self, category: str, limit: int = 20) -> List[Article]:
 
494
  'cloud-alibaba': 'https://www.alibabacloud.com/blog/feed',
495
  'cloud-digitalocean': 'https://www.digitalocean.com/blog/rss.xml',
496
  'cloud-cloudflare': 'https://blog.cloudflare.com/rss/',
497
+ 'cloud-huawei': 'https://blog.huawei.com', # Generic Huawei blog often used
498
  }
499
 
500
  async def fetch_news(self, category: str, limit: int = 20) -> List[Article]:
app/services/scheduler.py CHANGED
@@ -417,7 +417,11 @@ async def cleanup_old_news():
417
  logger.info("📅 Cutoff Date: %s", cutoff_date.strftime('%Y-%m-%d %H:%M:%S'))
418
  logger.info("🗑️ Articles published before this will be deleted...")
419
 
420
- # Query and delete old articles
 
 
 
 
421
  logger.info("🔍 Querying Appwrite for old articles...")
422
  from appwrite.query import Query
423
 
@@ -430,11 +434,11 @@ async def cleanup_old_news():
430
  ]
431
  )
432
 
433
- logger.info("📊 Found %d old articles to delete", len(response['documents']))
434
 
435
- deleted_count = 0
436
  if len(response['documents']) > 0:
437
- logger.info("🗑️ Deleting articles...")
438
 
439
  for doc in response['documents']:
440
  try:
@@ -450,14 +454,74 @@ async def cleanup_old_news():
450
  except Exception as ve:
451
  logger.warning("⚠️ Vector delete failed (non-critical): %s", ve)
452
 
453
- deleted_count += 1
454
- if deleted_count % 10 == 0:
455
- logger.info(" Progress: %d articles deleted...", deleted_count)
456
  except Exception as e:
457
  logger.error("❌ Error deleting document %s: %s", doc['$id'], e)
458
 
459
- # Clear Redis cache to force refresh from updated database
460
- logger.info("🔄 Clearing Redis cache...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
461
  cache_service = CacheService()
462
  cache_cleared = 0
463
  for category in CATEGORIES:
@@ -470,10 +534,17 @@ async def cleanup_old_news():
470
  if cache_cleared > 0:
471
  logger.info("✅ Cache cleared for %d categories", cache_cleared)
472
 
 
 
 
 
 
473
  logger.info("")
474
  logger.info("═" * 80)
475
  logger.info("🎉 [CLEANUP JANITOR] COMPLETED!")
476
- logger.info("🗑️ Total Deleted: %d articles", deleted_count)
 
 
477
  logger.info("⏰ Retention: Articles older than %d hours removed", retention_hours)
478
  logger.info("🕐 Completion Time: %s", datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
479
  logger.info("═" * 80)
 
417
  logger.info("📅 Cutoff Date: %s", cutoff_date.strftime('%Y-%m-%d %H:%M:%S'))
418
  logger.info("🗑️ Articles published before this will be deleted...")
419
 
420
+ # =========================================================================
421
+ # Step 1: Clean Regular Articles Collection
422
+ # =========================================================================
423
+ logger.info("")
424
+ logger.info("📰 [STEP 1] Cleaning regular articles...")
425
  logger.info("🔍 Querying Appwrite for old articles...")
426
  from appwrite.query import Query
427
 
 
434
  ]
435
  )
436
 
437
+ logger.info("📊 Found %d old regular articles to delete", len(response['documents']))
438
 
439
+ deleted_regular = 0
440
  if len(response['documents']) > 0:
441
+ logger.info("🗑️ Deleting regular articles...")
442
 
443
  for doc in response['documents']:
444
  try:
 
454
  except Exception as ve:
455
  logger.warning("⚠️ Vector delete failed (non-critical): %s", ve)
456
 
457
+ deleted_regular += 1
458
+ if deleted_regular % 10 == 0:
459
+ logger.info(" Progress: %d regular articles deleted...", deleted_regular)
460
  except Exception as e:
461
  logger.error("❌ Error deleting document %s: %s", doc['$id'], e)
462
 
463
+ logger.info("✅ Regular articles cleanup: %d deleted", deleted_regular)
464
+
465
+ # =========================================================================
466
+ # Step 2: Clean Cloud Articles Collection (Phase 3)
467
+ # =========================================================================
468
+ deleted_cloud = 0
469
+
470
+ # Only clean cloud collection if it's configured
471
+ if settings.APPWRITE_CLOUD_COLLECTION_ID:
472
+ logger.info("")
473
+ logger.info("☁️ [STEP 2] Cleaning cloud articles...")
474
+ logger.info("🔍 Querying Appwrite for old cloud articles...")
475
+
476
+ try:
477
+ cloud_response = appwrite_db.databases.list_documents(
478
+ database_id=settings.APPWRITE_DATABASE_ID,
479
+ collection_id=settings.APPWRITE_CLOUD_COLLECTION_ID,
480
+ queries=[
481
+ Query.less_than('published_at', cutoff_iso),
482
+ Query.limit(500)
483
+ ]
484
+ )
485
+
486
+ logger.info("📊 Found %d old cloud articles to delete", len(cloud_response['documents']))
487
+
488
+ if len(cloud_response['documents']) > 0:
489
+ logger.info("🗑️ Deleting cloud articles...")
490
+
491
+ for doc in cloud_response['documents']:
492
+ try:
493
+ appwrite_db.databases.delete_document(
494
+ database_id=settings.APPWRITE_DATABASE_ID,
495
+ collection_id=settings.APPWRITE_CLOUD_COLLECTION_ID,
496
+ document_id=doc['$id']
497
+ )
498
+
499
+ # Cleanup from ChromaDB as well
500
+ try:
501
+ vector_store.delete_vector(doc['$id'])
502
+ except Exception as ve:
503
+ logger.warning("⚠️ Vector delete failed (non-critical): %s", ve)
504
+
505
+ deleted_cloud += 1
506
+ if deleted_cloud % 10 == 0:
507
+ logger.info(" Progress: %d cloud articles deleted...", deleted_cloud)
508
+ except Exception as e:
509
+ logger.error("❌ Error deleting cloud document %s: %s", doc['$id'], e)
510
+
511
+ logger.info("✅ Cloud articles cleanup: %d deleted", deleted_cloud)
512
+
513
+ except Exception as e:
514
+ logger.warning("⚠️ Cloud collection cleanup skipped: %s", e)
515
+ logger.info("💡 Cloud collection may not exist yet - this is normal on first run")
516
+ else:
517
+ logger.info("")
518
+ logger.info("⏭️ [STEP 2] Skipping cloud articles (collection not configured)")
519
+
520
+ # =========================================================================
521
+ # Step 3: Clear Redis Cache
522
+ # =========================================================================
523
+ logger.info("")
524
+ logger.info("🔄 [STEP 3] Clearing Redis cache...")
525
  cache_service = CacheService()
526
  cache_cleared = 0
527
  for category in CATEGORIES:
 
534
  if cache_cleared > 0:
535
  logger.info("✅ Cache cleared for %d categories", cache_cleared)
536
 
537
+ # =========================================================================
538
+ # Final Summary
539
+ # =========================================================================
540
+ total_deleted = deleted_regular + deleted_cloud
541
+
542
  logger.info("")
543
  logger.info("═" * 80)
544
  logger.info("🎉 [CLEANUP JANITOR] COMPLETED!")
545
+ logger.info("🗑️ Total Deleted: %d articles", total_deleted)
546
+ logger.info(" 📰 Regular: %d", deleted_regular)
547
+ logger.info(" ☁️ Cloud: %d", deleted_cloud)
548
  logger.info("⏰ Retention: Articles older than %d hours removed", retention_hours)
549
  logger.info("🕐 Completion Time: %s", datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
550
  logger.info("═" * 80)
app/services/vector_store.py CHANGED
@@ -58,7 +58,14 @@ class VectorStore:
58
 
59
  def upsert_article(self, article_data: Dict, analysis_result: str):
60
  """
61
- Convert article + analysis into vector and save to ChromaDB.
 
 
 
 
 
 
 
62
  """
63
  if not self._initialized:
64
  self._initialize()
@@ -67,47 +74,91 @@ class VectorStore:
67
  return
68
 
69
  try:
70
- # Prepare text for embedding: Title + Summary + Analysis
71
- # We treat the "analysis" as high-value semantic content
72
- combined_text = f"{article_data.get('title', '')} \n {article_data.get('description', '')} \n {analysis_result}"
 
 
 
 
 
 
 
 
 
73
 
74
  # Observability: Log what we are embedding
75
- logger.info("📝 [Index] Embedding Article: '%s'", article_data.get('title', '')[:50])
76
- logger.info(" -> Content Length: %d chars", len(combined_text))
 
77
 
78
  # Generate embedding
79
- embedding = self.embedder.encode(combined_text).tolist()
80
 
81
- # Metadata for filtering
82
  metadata = {
 
83
  "source": article_data.get('source', 'Unknown'),
84
  "category": article_data.get('category', 'General'),
85
- "published_at": str(article_data.get('published_at', '')),
86
  "url": article_data.get('url', ''),
87
- "title": article_data.get('title', ''), # NEW: Store for search retrieval
88
- "description": article_data.get('description', ''), # NEW: Store for search retrieval
89
- "image": article_data.get('image', '') # NEW: Store for search retrieval
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  }
91
 
 
 
 
92
  # Upsert to ChromaDB
93
  # Use Appwrite Document ID ($id) as the ChromaDB ID for 1:1 mapping
94
  doc_id = article_data.get('$id')
95
  if not doc_id:
96
- # Fallback if no ID provided (shouldn't happen with shadow path)
97
  doc_id = article_data.get('url_hash', 'unknown')
98
 
99
  self.collection.upsert(
100
  ids=[doc_id],
101
  embeddings=[embedding],
102
  metadatas=[metadata],
103
- documents=[combined_text] # Optional: store raw text for debugging
104
  )
105
 
106
- logger.info("🧠 [ChromaDB] Upserted vector for: %s", article_data.get('title')[:30])
 
 
 
 
 
107
 
108
  except Exception as e:
109
  logger.error("❌ [ChromaDB] Upsert failed: %s", e)
110
 
 
111
  def search_articles(self, query: str, limit: int = 10) -> List[Dict]:
112
  """
113
  Semantic Search: Find articles conceptually similar to the query.
 
58
 
59
  def upsert_article(self, article_data: Dict, analysis_result: str):
60
  """
61
+ Phase 3: Enhanced vector storage with rich metadata
62
+
63
+ Converts article + AI analysis into searchable vector with:
64
+ - Optimized embedding format: "{Title} : {Summary}"
65
+ - Cloud news detection
66
+ - Engagement metrics (likes, views)
67
+ - Time-aware sorting (Unix timestamp)
68
+ - Tag-based filtering (GLiNER output)
69
  """
70
  if not self._initialized:
71
  self._initialize()
 
74
  return
75
 
76
  try:
77
+ # Import HTML stripping utility
78
+ from app.utils import strip_html_if_needed
79
+ import time
80
+
81
+ # Clean text (only strips if HTML detected)
82
+ title_clean = strip_html_if_needed(article_data.get('title', ''))
83
+ desc_clean = strip_html_if_needed(article_data.get('description', ''))
84
+
85
+ # Phase 3: Optimized Combined Embedding
86
+ # Format: "{Title} : {Summary}"
87
+ # The colon separator helps the model distinguish title from body
88
+ text_to_embed = f"{title_clean} : {analysis_result}"
89
 
90
  # Observability: Log what we are embedding
91
+ logger.info("📝 [Index] Embedding Article: '%s'", title_clean[:50])
92
+ logger.info(" -> Format: '{Title} : {Summary}'")
93
+ logger.info(" -> Total Length: %d chars", len(text_to_embed))
94
 
95
  # Generate embedding
96
+ embedding = self.embedder.encode(text_to_embed).tolist()
97
 
98
+ # Phase 3: Enhanced Metadata Schema
99
  metadata = {
100
+ # Core identification
101
  "source": article_data.get('source', 'Unknown'),
102
  "category": article_data.get('category', 'General'),
 
103
  "url": article_data.get('url', ''),
104
+
105
+ # Display data (cleaned)
106
+ "title": title_clean,
107
+ "description": desc_clean,
108
+ "image": article_data.get('image', ''),
109
+
110
+ # Phase 3: Filtering & Search
111
+ "tags": article_data.get('tags', ''), # GLiNER output (comma-separated)
112
+
113
+ # Phase 3: Time-aware ranking
114
+ "timestamp": int(time.time()), # Unix timestamp (numeric, sortable)
115
+ "published_at": str(article_data.get('published_at', '')), # ISO string
116
+
117
+ # Phase 3: Future features
118
+ "audio_url": "", # Placeholder for TTS
119
+
120
+ # Phase 3: Cloud detection
121
+ "is_cloud_news": article_data.get('is_cloud_news', False),
122
+ "cloud_provider": article_data.get('cloud_provider', ''), # "aws", "azure", etc.
123
+ "is_official": article_data.get('is_official', False), # True if official blog
124
+
125
+ # Phase 3: Engagement metrics (for ranking)
126
+ "likes": article_data.get('likes', 0),
127
+ "dislikes": article_data.get('dislikes', 0),
128
+ "views": article_data.get('views', 0),
129
+
130
+ # Phase 3: Schema versioning
131
+ "processing_version": "v2_phase3"
132
  }
133
 
134
+ # Phase 3: Document field = Llama-3 summary ONLY (not original HTML)
135
+ document = analysis_result
136
+
137
  # Upsert to ChromaDB
138
  # Use Appwrite Document ID ($id) as the ChromaDB ID for 1:1 mapping
139
  doc_id = article_data.get('$id')
140
  if not doc_id:
141
+ # Fallback if no ID provided
142
  doc_id = article_data.get('url_hash', 'unknown')
143
 
144
  self.collection.upsert(
145
  ids=[doc_id],
146
  embeddings=[embedding],
147
  metadatas=[metadata],
148
+ documents=[document]
149
  )
150
 
151
+ # Enhanced logging
152
+ cloud_status = "☁️ CLOUD" if metadata['is_cloud_news'] else "📰 REGULAR"
153
+ logger.info("🧠 [ChromaDB] Upserted: %s | %s | Tags: %s",
154
+ title_clean[:30],
155
+ cloud_status,
156
+ metadata['tags'][:30] if metadata['tags'] else 'None')
157
 
158
  except Exception as e:
159
  logger.error("❌ [ChromaDB] Upsert failed: %s", e)
160
 
161
+
162
  def search_articles(self, query: str, limit: int = 10) -> List[Dict]:
163
  """
164
  Semantic Search: Find articles conceptually similar to the query.
app/utils.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utility Functions for Segmento Pulse
3
+ Provides common helpers for text processing, HTML cleaning, and data transformation
4
+ """
5
+
6
+ import re
7
+ from html import unescape
8
+
9
+
10
def strip_html_if_needed(text: str) -> str:
    """
    Strip HTML tags/entities from *text*, but only when markup is present.

    RSS feeds deliver either plain text or HTML fragments. A cheap
    character scan decides whether the (comparatively expensive) regex
    cleanup is needed at all, so already-clean strings pass through fast.

    Args:
        text: Input text that may or may not contain HTML.

    Returns:
        Text with tags removed, entities decoded, and whitespace collapsed.

    Examples:
        >>> strip_html_if_needed("Plain text")
        'Plain text'

        >>> strip_html_if_needed("<b>Bold</b> text")
        'Bold text'

        >>> strip_html_if_needed("AT&amp;T announces...")
        'AT&T announces...'
    """
    if not text:
        return ""

    # Fast path: none of the markup trigger characters present -> clean.
    has_markup = any(ch in text for ch in '<>&')
    if not has_markup:
        return text.strip()

    # Slow path: drop tags, decode entities, collapse whitespace.
    without_tags = re.sub(r'<[^>]+>', '', text)
    decoded = unescape(without_tags)
    return re.sub(r'\s+', ' ', decoded).strip()
53
+
54
+
55
def detect_html(text: str) -> bool:
    """
    Cheap heuristic check for HTML markup.

    Note: any angle bracket counts, so "3 < 5" also reports True — this
    is a deliberate fast pre-filter, not a parser.

    Args:
        text: Text to inspect.

    Returns:
        True when an angle bracket is present, False otherwise
        (including for empty/falsy input).
    """
    return bool(text) and ('<' in text or '>' in text)
69
+
70
+
71
def truncate_text(text: str, max_length: int = 200, suffix: str = "...") -> str:
    """
    Safely truncate text to at most *max_length* characters.

    Args:
        text: Text to truncate (empty/None passes through unchanged).
        max_length: Maximum length of the result (default: 200).
        suffix: Marker appended when truncation happens (default: "...").

    Returns:
        The original text if it already fits, otherwise a truncated
        string ending in *suffix*. When max_length is smaller than the
        suffix itself, only the suffix is returned.
    """
    if not text or len(text) <= max_length:
        return text

    # Bug fix: clamp the cut point at 0. Previously, a max_length smaller
    # than len(suffix) produced a NEGATIVE slice (text[:-1] etc.), which
    # kept almost the entire string instead of truncating it.
    cut = max(0, max_length - len(suffix))
    return text[:cut].strip() + suffix
87
+
88
+
89
def normalize_url(url: str) -> str:
    """
    Canonicalize a URL for deduplication.

    Whitespace is stripped, trailing slashes dropped, and the whole
    string lowercased so trivially-different spellings of the same link
    compare equal.

    Args:
        url: URL to normalize.

    Returns:
        Normalized URL ("" for falsy input).
    """
    if not url:
        return ""

    cleaned = url.strip()
    cleaned = cleaned.rstrip('/')
    return cleaned.lower()
107
+
108
+
109
def extract_domain(url: str) -> str:
    """
    Extract the bare domain name from a URL.

    Args:
        url: Full URL (e.g., "https://www.TechCrunch.com/2024/news").

    Returns:
        Lowercase domain without scheme, path, port, or a leading
        "www." (e.g., "techcrunch.com"). Returns "" for falsy input.
    """
    if not url:
        return ""

    # Drop the scheme case-insensitively (the old pattern missed
    # "HTTPS://"), then keep everything before the first "/".
    domain = re.sub(r'^https?://', '', url.strip(), flags=re.IGNORECASE)
    domain = domain.split('/')[0]

    # Drop an explicit port (e.g., "example.com:8080" -> "example.com").
    domain = domain.split(':')[0]

    # Lowercase BEFORE removing the prefix so "WWW.Example.com" is
    # handled, and strip "www." only at the START: the old
    # replace('www.', '') removed the substring anywhere, corrupting
    # domains such as "mywww.site" -> "mysite".
    domain = domain.lower()
    if domain.startswith('www.'):
        domain = domain[4:]

    return domain
131
+
132
+
133
def comma_separated_to_list(text: str) -> list:
    """
    Split a comma-separated string into a list of trimmed tokens.

    Empty tokens (from stray or doubled commas) are dropped.

    Args:
        text: Comma-separated string (e.g., "AI,Tech,Cloud").

    Returns:
        List of non-empty strings (e.g., ["AI", "Tech", "Cloud"]).
    """
    if not text:
        return []

    trimmed = (piece.strip() for piece in text.split(','))
    return [token for token in trimmed if token]
147
+
148
+
149
def list_to_comma_separated(items: list) -> str:
    """
    Join a list into a comma-separated string.

    Falsy entries are skipped; everything else is stringified and
    trimmed before joining.

    Args:
        items: List of values.

    Returns:
        Comma-separated string ("" for an empty/falsy list).
    """
    if not items:
        return ""

    parts = [str(entry).strip() for entry in items if entry]
    return ",".join(parts)