"""Answer questions about a PDF using retrieval-augmented generation.

Pipeline: extract PDF text (PyPDF2) -> fixed-width chunking -> embed chunks
(SentenceTransformer) -> store/search in an in-memory Qdrant collection ->
feed the top-matching chunks to a Gemini model as context for the answer.
"""

import os

import google.generativeai as genai
import PyPDF2
from qdrant_client import QdrantClient, models
from sentence_transformers import SentenceTransformer

# Shared, module-level resources: an in-memory Qdrant instance (data is lost
# on process exit) and the embedding model used for both chunks and queries.
client = QdrantClient(":memory:")  # Use in-memory Qdrant for simplicity
encoder = SentenceTransformer('all-MiniLM-L6-v2')

COLLECTION_NAME = "pdf_documents"
CHUNK_SIZE = 500  # characters per chunk (simple fixed-width chunking)
TOP_K = 3         # number of most-relevant chunks retrieved as context


def create_collection_if_not_exists():
    """Create the Qdrant collection unless it already exists (EAFP check)."""
    try:
        client.get_collection(collection_name=COLLECTION_NAME)
    except Exception:
        client.create_collection(
            collection_name=COLLECTION_NAME,
            vectors_config=models.VectorParams(
                size=encoder.get_sentence_embedding_dimension(),
                distance=models.Distance.COSINE,
            ),
        )


def _reset_collection():
    """Drop and recreate the collection.

    Point ids restart at 0 for every indexed PDF, so without a reset a
    previously indexed longer document leaves stale trailing chunks behind
    that can pollute search results for the current one.
    """
    try:
        client.delete_collection(collection_name=COLLECTION_NAME)
    except Exception:
        pass  # collection may not exist yet; create_* below handles it
    create_collection_if_not_exists()


create_collection_if_not_exists()


def answer_pdf_question(pdf_path, question, gemini_model):
    """Answer *question* using the content of the PDF at *pdf_path*.

    Args:
        pdf_path: Filesystem path to the PDF to index.
        question: Natural-language question to answer.
        gemini_model: A configured ``genai.GenerativeModel`` instance.

    Returns:
        The model's answer text, or a human-readable error string on failure
        (this function deliberately never raises to its caller).
    """
    try:
        # Extract text from every page. extract_text() may return None for
        # image-only pages in older PyPDF2 versions, so guard with `or ""`.
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            text = "".join(page.extract_text() or "" for page in reader.pages)

        if not text.strip():
            return "Error processing PDF or answering question: no extractable text in PDF"

        # Simple fixed-width chunking; embed all chunks in one batched call
        # (much faster than one encode() per chunk).
        chunks = [text[i:i + CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE)]
        vectors = encoder.encode(chunks)

        # Reset the collection so results come only from THIS document,
        # then index the fresh chunks.
        _reset_collection()
        points = [
            models.PointStruct(
                id=i,
                vector=vector.tolist(),
                payload={"text": chunk},
            )
            for i, (chunk, vector) in enumerate(zip(chunks, vectors))
        ]
        client.upsert(
            collection_name=COLLECTION_NAME,
            points=points,
            wait=True,
        )

        # Retrieve the chunks most similar to the question.
        query_vector = encoder.encode(question).tolist()
        search_result = client.search(
            collection_name=COLLECTION_NAME,
            query_vector=query_vector,
            limit=TOP_K,
        )
        context = " ".join(hit.payload['text'] for hit in search_result)

        # Ask Gemini to answer grounded in the retrieved context.
        response = gemini_model.generate_content(
            f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"
        )
        return response.text
    except Exception as e:
        # Top-level boundary: report rather than raise, matching the
        # function's error-string contract.
        return f"Error processing PDF or answering question: {e}"