Spaces:
Sleeping
Sleeping
Update rss_processor.py
Browse files
- rss_processor.py +14 -10
rss_processor.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
import os
|
| 2 |
import feedparser
|
| 3 |
-
from
|
| 4 |
from langchain.embeddings import HuggingFaceEmbeddings
|
| 5 |
from langchain.docstore.document import Document
|
| 6 |
import logging
|
|
@@ -163,14 +163,11 @@ def process_and_store_articles(articles):
|
|
| 163 |
if not os.path.exists(LOCAL_DB_DIR):
|
| 164 |
os.makedirs(LOCAL_DB_DIR)
|
| 165 |
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
embedding_function=get_embedding_model(),
|
| 169 |
-
collection_name=COLLECTION_NAME
|
| 170 |
-
)
|
| 171 |
|
| 172 |
try:
|
| 173 |
-
existing_ids = set(
|
| 174 |
logger.info(f"Loaded {len(existing_ids)} existing document IDs from {LOCAL_DB_DIR}.")
|
| 175 |
except Exception as e:
|
| 176 |
logger.info(f"No existing DB found or error loading IDs: {e}. Starting fresh.")
|
|
@@ -201,9 +198,16 @@ def process_and_store_articles(articles):
|
|
| 201 |
|
| 202 |
if docs_to_add:
|
| 203 |
try:
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
except Exception as e:
|
| 208 |
logger.error(f"Error storing articles: {e}")
|
| 209 |
|
|
|
|
| 1 |
import os
|
| 2 |
import feedparser
|
| 3 |
+
from chromadb import Client, Documents
|
| 4 |
from langchain.embeddings import HuggingFaceEmbeddings
|
| 5 |
from langchain.docstore.document import Document
|
| 6 |
import logging
|
|
|
|
| 163 |
if not os.path.exists(LOCAL_DB_DIR):
|
| 164 |
os.makedirs(LOCAL_DB_DIR)
|
| 165 |
|
| 166 |
+
client = Client(persist_directory=LOCAL_DB_DIR)
|
| 167 |
+
collection = client.get_or_create_collection(name=COLLECTION_NAME)
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
try:
|
| 170 |
+
existing_ids = set(collection.get(include=[])["ids"])
|
| 171 |
logger.info(f"Loaded {len(existing_ids)} existing document IDs from {LOCAL_DB_DIR}.")
|
| 172 |
except Exception as e:
|
| 173 |
logger.info(f"No existing DB found or error loading IDs: {e}. Starting fresh.")
|
|
|
|
| 198 |
|
| 199 |
if docs_to_add:
|
| 200 |
try:
|
| 201 |
+
embeddings = get_embedding_model()
|
| 202 |
+
for doc, doc_id in zip(docs_to_add, ids_to_add):
|
| 203 |
+
collection.add(
|
| 204 |
+
documents=[doc.page_content],
|
| 205 |
+
metadatas=[doc.metadata],
|
| 206 |
+
ids=[doc_id],
|
| 207 |
+
embeddings=[embeddings.embed_query(doc.page_content)]
|
| 208 |
+
)
|
| 209 |
+
client.persist()
|
| 210 |
+
logger.info(f"Added {len(docs_to_add)} new articles to DB. Total in DB: {collection.count()}")
|
| 211 |
except Exception as e:
|
| 212 |
logger.error(f"Error storing articles: {e}")
|
| 213 |
|