# Multi-MCP/tools/pdf_qa.py — PDF question answering via Qdrant retrieval + Gemini.
import PyPDF2
import google.generativeai as genai
from qdrant_client import QdrantClient, models
from sentence_transformers import SentenceTransformer
import os
# Initialize Qdrant client and Sentence Transformer model.
# The in-memory Qdrant instance lives only for this process — the index is
# rebuilt on every run; nothing is persisted to disk.
client = QdrantClient(":memory:") # Use in-memory Qdrant for simplicity
encoder = SentenceTransformer('all-MiniLM-L6-v2')  # sentence-embedding model used for chunks and queries
COLLECTION_NAME = "pdf_documents"  # single shared collection for all indexed PDFs
def create_collection_if_not_exists():
    """Create the shared Qdrant collection if it is not already present.

    Uses ``client.collection_exists()`` instead of catching a broad
    ``Exception`` from ``get_collection()``: the original pattern treated
    *any* client error (not just "collection missing") as a reason to
    create the collection, silently masking real failures.
    """
    if not client.collection_exists(collection_name=COLLECTION_NAME):
        client.create_collection(
            collection_name=COLLECTION_NAME,
            # Vector size must match the encoder's output dimension;
            # cosine distance is the conventional metric for
            # sentence-transformer embeddings.
            vectors_config=models.VectorParams(
                size=encoder.get_sentence_embedding_dimension(),
                distance=models.Distance.COSINE,
            ),
        )
# Ensure the collection exists as soon as this module is imported, so the
# functions below can assume it is available.
create_collection_if_not_exists()
def answer_pdf_question(pdf_path, question, gemini_model):
    """Answer a question about a PDF via retrieval-augmented generation.

    Extracts the PDF's text, chunks it, indexes the chunks in the
    module-level Qdrant collection, retrieves the chunks most similar to
    the question, and asks the supplied Gemini model to answer from that
    context.

    Args:
        pdf_path: Filesystem path to the PDF file.
        question: Natural-language question about the document.
        gemini_model: A configured google.generativeai model exposing
            ``generate_content``.

    Returns:
        The model's answer text, or an error-description string on
        failure (this tool reports errors via the return value rather
        than raising — preserved from the original contract).
    """
    try:
        # Extract text page by page. extract_text() may return None for
        # pages with no extractable text in some PyPDF2 versions, which
        # would previously crash the `text +=` concatenation.
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            page_texts = [page.extract_text() or "" for page in reader.pages]
        text = "".join(page_texts)
        if not text:
            # Without this guard an empty PDF indexes nothing and the
            # model is queried with an empty context.
            return "Error processing PDF or answering question: no extractable text in PDF"

        # Simple fixed-size chunking (500 characters per chunk).
        chunks = [text[i:i + 500] for i in range(0, len(text), 500)]

        # Reset the collection before indexing: upsert alone only
        # overwrites ids that happen to collide, so chunks from a
        # previously indexed (longer) PDF would otherwise remain and leak
        # into this document's retrieval results.
        client.delete_collection(collection_name=COLLECTION_NAME)
        create_collection_if_not_exists()

        # Batch-encode all chunks in a single call instead of one
        # encoder invocation per chunk.
        vectors = encoder.encode(chunks)
        points = [
            models.PointStruct(id=i, vector=vec.tolist(), payload={"text": chunk})
            for i, (chunk, vec) in enumerate(zip(chunks, vectors))
        ]
        client.upsert(
            collection_name=COLLECTION_NAME,
            points=points,
            wait=True,
        )

        # Retrieve the top-3 chunks most similar to the question.
        query_vector = encoder.encode(question).tolist()
        search_result = client.search(
            collection_name=COLLECTION_NAME,
            query_vector=query_vector,
            limit=3,
        )
        context = " ".join(hit.payload['text'] for hit in search_result)

        # Ask Gemini to answer from the retrieved context only.
        response = gemini_model.generate_content(f"Context: {context}\n\nQuestion: {question}\n\nAnswer:")
        return response.text
    except Exception as e:
        # Top-level boundary: convert any failure into the tool's
        # error-string return contract.
        return f"Error processing PDF or answering question: {e}"