pdf_rag / src /embedding.py
LightRT's picture
Final Formatting
bb05158
import logging
import os
import uuid

from dotenv import load_dotenv
from fastembed import TextEmbedding, SparseTextEmbedding
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, SparseVectorParams, PointStruct

from src.ingestion import ingestion_and_chunking
# Load variables from a .env file (when one is present) into the process
# environment before the Qdrant settings are read below.
load_dotenv()

# Qdrant connection settings; either value is None when its variable is unset.
qdrant_url = os.environ.get("QDRANT_URL")
qdrant_api_key = os.environ.get("QDRANT_API_KEY")
def upload_file(file_path: str, user_id: str, collection_name: str = "pdf_rag") -> None:
    """Chunk a file, embed every chunk, and upsert the result into Qdrant.

    The collection is created on first use with one named dense vector
    ("dense", 384 dimensions, cosine distance) and one named sparse vector
    ("sparse"). Each chunk becomes one point carrying both vectors and a
    payload with the owning user, a per-upload file id, the chunk text and
    the "source", "pages" and "section" metadata entries of the chunk.

    Args:
        file_path: Path handed to ``ingestion_and_chunking``.
        user_id: Stored in every point's payload under "user_id".
        collection_name: Target Qdrant collection.

    Raises:
        ValueError: If ingestion yields no chunks for ``file_path``.
    """
    log = logging.getLogger(__name__)
    client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)
    try:
        if not client.collection_exists(collection_name):
            # NOTE(review): Qdrant's BM25 guidance pairs the "Qdrant/bm25"
            # model with SparseVectorParams(modifier=Modifier.IDF) - confirm
            # whether omitting the modifier here is intentional.
            client.create_collection(
                collection_name=collection_name,
                vectors_config={
                    "dense": VectorParams(size=384, distance=Distance.COSINE),
                },
                sparse_vectors_config={
                    "sparse": SparseVectorParams(),
                },
            )

        docs = ingestion_and_chunking(file_path)
        if not docs:
            # Fail early with a clear message instead of loading the
            # embedding models and sending an empty upsert request.
            raise ValueError(
                f"No chunks were produced from {file_path!r}; nothing to upload."
            )

        dense_model = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
        sparse_model = SparseTextEmbedding(model_name="Qdrant/bm25")

        texts = [doc.page_content for doc in docs]
        dense_vectors = list(dense_model.embed(texts))
        sparse_vectors = list(sparse_model.embed(texts))

        # One id shared by every chunk of this upload; each chunk also gets
        # its own random point id.
        file_id = str(uuid.uuid4())
        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector={
                    "dense": dense_vec.tolist(),
                    "sparse": {
                        "indices": sparse_emb.indices.tolist(),
                        "values": sparse_emb.values.tolist(),
                    },
                },
                payload={
                    "user_id": user_id,
                    "file_id": file_id,
                    "text": doc.page_content,
                    "source": doc.metadata.get("source"),
                    "pages": doc.metadata.get("pages"),
                    "section": doc.metadata.get("section"),
                },
            )
            for doc, dense_vec, sparse_emb in zip(docs, dense_vectors, sparse_vectors)
        ]

        # Index creation stays best-effort: a failure must not block the
        # upload, but it is logged rather than silently discarded.
        try:
            client.create_payload_index(
                collection_name=collection_name,
                field_name="user_id",
                field_schema="keyword",
            )
        except Exception:
            log.warning(
                "Could not create 'user_id' payload index on collection %r",
                collection_name,
                exc_info=True,
            )

        client.upsert(collection_name=collection_name, points=points)
    finally:
        # Release the connection opened for this call.
        client.close()