Spaces:

broadfield-dev
/

RSS_News_1

Paused

broadfield-dev committed on Jun 23, 2025

Commit

8c48251

verified ·

1 Parent(s): 104836a

Update rss_processor.py

Files changed (1) hide show

rss_processor.py CHANGED Viewed

@@ -160,15 +160,19 @@ def categorize_feed(url):
         return "Uncategorized"
 def process_and_store_articles(articles):
-    if os.path.exists(LOCAL_DB_DIR):
-        shutil.rmtree(LOCAL_DB_DIR)
     vector_db = Chroma(
         persist_directory=LOCAL_DB_DIR,
         embedding_function=get_embedding_model(),
         collection_name=COLLECTION_NAME
     )
     docs_to_add = []
     ids_to_add = []
@@ -177,6 +181,9 @@ def process_and_store_articles(articles):
         cleaned_link = clean_text(article["link"])
         doc_id = f"{cleaned_title}|{cleaned_link}|{article['published']}"
         metadata = {
             "title": article["title"],
             "link": article["link"],
@@ -188,6 +195,7 @@ def process_and_store_articles(articles):
         doc = Document(page_content=clean_text(article["description"]), metadata=metadata)
         docs_to_add.append(doc)
         ids_to_add.append(doc_id)
     if docs_to_add:
         try:

         return "Uncategorized"
 def process_and_store_articles(articles):
     vector_db = Chroma(
         persist_directory=LOCAL_DB_DIR,
         embedding_function=get_embedding_model(),
         collection_name=COLLECTION_NAME
     )
+    try:
+        existing_ids = set(vector_db.get(include=[])["ids"])
+        logger.info(f"Loaded {len(existing_ids)} existing document IDs from {LOCAL_DB_DIR}.")
+    except Exception as e:
+        logger.info(f"No existing DB found or error loading IDs: {e}. Starting fresh.")
+        existing_ids = set()
     docs_to_add = []
     ids_to_add = []
         cleaned_link = clean_text(article["link"])
         doc_id = f"{cleaned_title}|{cleaned_link}|{article['published']}"
+        if doc_id in existing_ids:
+            continue
         metadata = {
             "title": article["title"],
             "link": article["link"],
         doc = Document(page_content=clean_text(article["description"]), metadata=metadata)
         docs_to_add.append(doc)
         ids_to_add.append(doc_id)
+        existing_ids.add(doc_id)
     if docs_to_add:
         try: