Spaces:
Paused
Paused
Update rss_processor.py
Browse files - rss_processor.py +11 -3
rss_processor.py
CHANGED
|
@@ -160,15 +160,19 @@ def categorize_feed(url):
|
|
| 160 |
return "Uncategorized"
|
| 161 |
|
| 162 |
def process_and_store_articles(articles):
|
| 163 |
-
if os.path.exists(LOCAL_DB_DIR):
|
| 164 |
-
shutil.rmtree(LOCAL_DB_DIR)
|
| 165 |
-
|
| 166 |
vector_db = Chroma(
|
| 167 |
persist_directory=LOCAL_DB_DIR,
|
| 168 |
embedding_function=get_embedding_model(),
|
| 169 |
collection_name=COLLECTION_NAME
|
| 170 |
)
|
| 171 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
docs_to_add = []
|
| 173 |
ids_to_add = []
|
| 174 |
|
|
@@ -177,6 +181,9 @@ def process_and_store_articles(articles):
|
|
| 177 |
cleaned_link = clean_text(article["link"])
|
| 178 |
doc_id = f"{cleaned_title}|{cleaned_link}|{article['published']}"
|
| 179 |
|
|
|
|
|
|
|
|
|
|
| 180 |
metadata = {
|
| 181 |
"title": article["title"],
|
| 182 |
"link": article["link"],
|
|
@@ -188,6 +195,7 @@ def process_and_store_articles(articles):
|
|
| 188 |
doc = Document(page_content=clean_text(article["description"]), metadata=metadata)
|
| 189 |
docs_to_add.append(doc)
|
| 190 |
ids_to_add.append(doc_id)
|
|
|
|
| 191 |
|
| 192 |
if docs_to_add:
|
| 193 |
try:
|
|
|
|
| 160 |
return "Uncategorized"
|
| 161 |
|
| 162 |
def process_and_store_articles(articles):
|
|
|
|
|
|
|
|
|
|
| 163 |
vector_db = Chroma(
|
| 164 |
persist_directory=LOCAL_DB_DIR,
|
| 165 |
embedding_function=get_embedding_model(),
|
| 166 |
collection_name=COLLECTION_NAME
|
| 167 |
)
|
| 168 |
|
| 169 |
+
try:
|
| 170 |
+
existing_ids = set(vector_db.get(include=[])["ids"])
|
| 171 |
+
logger.info(f"Loaded {len(existing_ids)} existing document IDs from {LOCAL_DB_DIR}.")
|
| 172 |
+
except Exception as e:
|
| 173 |
+
logger.info(f"No existing DB found or error loading IDs: {e}. Starting fresh.")
|
| 174 |
+
existing_ids = set()
|
| 175 |
+
|
| 176 |
docs_to_add = []
|
| 177 |
ids_to_add = []
|
| 178 |
|
|
|
|
| 181 |
cleaned_link = clean_text(article["link"])
|
| 182 |
doc_id = f"{cleaned_title}|{cleaned_link}|{article['published']}"
|
| 183 |
|
| 184 |
+
if doc_id in existing_ids:
|
| 185 |
+
continue
|
| 186 |
+
|
| 187 |
metadata = {
|
| 188 |
"title": article["title"],
|
| 189 |
"link": article["link"],
|
|
|
|
| 195 |
doc = Document(page_content=clean_text(article["description"]), metadata=metadata)
|
| 196 |
docs_to_add.append(doc)
|
| 197 |
ids_to_add.append(doc_id)
|
| 198 |
+
existing_ids.add(doc_id)
|
| 199 |
|
| 200 |
if docs_to_add:
|
| 201 |
try:
|