Spaces:
Sleeping
Sleeping
from pdf_loader import load_pdf
from optimal_chunker import chunk_documents
from embedder_light import get_embedder, embed_text
from vector_store import get_chroma_client, create_collection
class RAGPipeline:
    """Minimal RAG pipeline: load a PDF, chunk it, embed the chunks,
    store them in a Chroma collection, and answer questions by
    nearest-neighbour retrieval over the stored embeddings.
    """

    def __init__(self):
        # Embedding tokenizer/model pair plus a Chroma client and collection.
        # NOTE(review): semantics of the project helpers (get_embedder,
        # get_chroma_client, create_collection) are not visible here.
        self.tokenizer, self.model = get_embedder()
        self.db_client = get_chroma_client()
        self.collection = create_collection(self.db_client)

    def index_document(self, pdf_path):
        """Load, chunk, embed and store one PDF in the collection.

        pdf_path: path to the PDF file to index.
        """
        # Original progress messages were encoding-corrupted mojibake;
        # restored to readable emoji equivalents.
        print(f"📄 Loading: {pdf_path}")
        docs = load_pdf(pdf_path)
        print("✂️ Chunking...")
        chunks = chunk_documents(docs)
        print("🔢 Creating embeddings...")
        texts = [chunk.page_content for chunk in chunks]
        vectors = embed_text(texts, self.tokenizer, self.model)
        print("🔧 Adding to ChromaDB...")
        # Offset the ids by the current collection size: the original
        # restarted numbering at doc_0 on every call, so indexing a second
        # document silently overwrote the first one's entries.
        start = self.collection.count()
        ids = [f"doc_{start + i}" for i in range(len(texts))]
        self.collection.add(documents=texts, embeddings=vectors, ids=ids)
        print(f"✅ Indexed {len(texts)} chunks.")

    def query(self, question, n_results=3):
        """Embed *question*, retrieve the closest chunks, return the best one.

        question:  natural-language query string.
        n_results: number of neighbours to retrieve (default 3, as before).
        Returns the text of the single closest chunk, or None when the
        collection is empty (the original raised IndexError in that case).
        """
        print(f"❓ Question: {question}")
        question_vec = embed_text([question], self.tokenizer, self.model)[0]
        results = self.collection.query(
            query_embeddings=[question_vec],
            n_results=n_results,
        )
        docs = results["documents"][0]
        if not docs:
            # Guard: querying before anything was indexed used to crash.
            print("No documents indexed yet.")
            return None
        # The original source had this string literal broken across lines
        # (a pasted literal newline); reconstructed as an explicit \n.
        print("\n📚 Top Documents:")
        for i, doc in enumerate(docs):
            # 200-char preview per hit, blank line after each (trailing \n).
            print(f"{i + 1}. {doc[:200]}...\n")
        return docs[0]