# newslens/src/db/vector_store.py
# Author: Jitender20 — "Add NewsLens Streamlit app" (commit 208266a)
import hashlib
import chromadb
from sentence_transformers import SentenceTransformer
from src.config import CHROMA_DB_PATH, HF_TOKEN
# Ensure the persistent ChromaDB directory exists before any client opens it.
CHROMA_DB_PATH.mkdir(parents=True, exist_ok=True)
class NewsVectorStore:
    """Persistent ChromaDB-backed vector store for news articles.

    Embeds article text with a SentenceTransformer model that is loaded
    once per process (cached on the class) and supports batch upsert plus
    cosine-similarity search over a single named collection.
    """

    # Class-level cache: the embedding model is loaded at most once per process.
    _model = None

    def __init__(self, collection_name: str = "news_articles"):
        """Open (or create) the persistent collection and load the embedder.

        Args:
            collection_name: Name of the ChromaDB collection to use.
        """
        print(f"Initializing ChromaDB at {CHROMA_DB_PATH}...")
        self.client = chromadb.PersistentClient(path=str(CHROMA_DB_PATH))
        # Cosine space so query distances convert to similarity as 1 - distance.
        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            metadata={"hnsw:space": "cosine"}
        )
        if NewsVectorStore._model is None:
            print("Loading embedding model (this takes a few seconds)...")
            NewsVectorStore._model = SentenceTransformer(
                'all-MiniLM-L6-v2',
                token=HF_TOKEN,
            )
        self.embedding_model = NewsVectorStore._model
        print("ChromaDB initialized and embedding model loaded.")

    def store_articles(self, articles_data) -> None:
        """Embed and upsert a batch of NewsAPI article dicts.

        Expects a list of dictionaries from NewsAPI. Articles without a URL,
        with effectively empty text, or whose URL duplicates an earlier
        article in the same batch are skipped (duplicate IDs within a single
        upsert call are rejected by ChromaDB).
        """
        if not articles_data:
            print("No articles to store.")
            return
        documents = []
        metadatas = []
        ids = []
        seen_ids = set()  # guard against duplicate URLs within this batch
        for article in articles_data:
            url = article.get('url')
            if not url:
                continue
            title = article.get('title') or ""
            desc = article.get('description') or ""
            content = article.get("content") or ""
            text_to_embed = f"{title}. {desc}. {content}"
            # Skip articles whose combined text is effectively empty
            # (the separator dots alone contribute a few characters).
            if len(text_to_embed.strip()) <= 5:
                continue
            # Deterministic ID derived from the URL (non-cryptographic use).
            doc_id = hashlib.md5(url.encode()).hexdigest()
            if doc_id in seen_ids:
                continue
            seen_ids.add(doc_id)
            documents.append(text_to_embed)
            # Store metadata so we can display it later in the UI
            metadatas.append({
                "source": article.get('source', {}).get('name', 'Unknown'),
                "url": url,
                "publishedAt": article.get('publishedAt', ''),
                "title": title,
                "description": desc
            })
            ids.append(doc_id)
        if not documents:
            print("No valid documents to store.")
            return
        # Generate embeddings in batches to bound peak memory.
        print(f"Generating embeddings for {len(documents)} articles...")
        embeddings = self.embedding_model.encode(documents, batch_size=32).tolist()
        # Upsert so re-fetched articles refresh their stored copy.
        self.collection.upsert(
            embeddings=embeddings,
            documents=documents,
            metadatas=metadatas,
            ids=ids
        )
        print(f"Successfully stored {len(documents)} articles in ChromaDB!")

    def query(self, topic: str, top_k: int = 10) -> list[dict]:
        """Embed the query topic and retrieve the top-k most similar articles.

        Returns:
            A list of dicts with keys: text, source, url, publishedAt,
            similarity_score (1 - cosine distance, rounded to 4 places),
            title, and description.
        """
        print(f"querying chromaDB for the topic: '{topic}'")
        query_embedding = self.embedding_model.encode([topic]).tolist()
        results = self.collection.query(
            query_embeddings=query_embedding,
            n_results=top_k,
            include=["documents", "metadatas", "distances"]
        )
        articles = []
        # Chroma returns one inner list per query embedding; we sent exactly one.
        for doc, meta, dist in zip(
            results["documents"][0],
            results["metadatas"][0],
            results["distances"][0]
        ):
            articles.append({
                "text": doc,
                "source": meta.get("source", "Unknown"),
                "url": meta.get("url", ""),
                "publishedAt": meta.get("publishedAt", ""),
                # Cosine distance -> similarity.
                "similarity_score": round(1 - dist, 4),
                "title": meta.get("title", ""),
                "description": meta.get("description", ""),
            })
        print(f"Retrieved {len(articles)} articles.")
        return articles
if __name__ == "__main__":
    # Quick smoke check: open the store and list every stored article URL.
    db = NewsVectorStore()
    print(f"Total documents in collection: {db.collection.count()}")
    # Fetch only metadatas — document bodies/embeddings are not needed here.
    results = db.collection.get(include=["metadatas"])
    for meta in results["metadatas"]:
        url = meta.get("url")
        if url:  # skip entries stored without a url metadata field
            print(url)