|
|
import PyPDF2 |
|
|
import google.generativeai as genai |
|
|
from qdrant_client import QdrantClient, models |
|
|
from sentence_transformers import SentenceTransformer |
|
|
import os |
|
|
|
|
|
|
|
|
# In-memory Qdrant instance: vectors live only for this process's lifetime,
# so every run starts with an empty store (nothing is persisted to disk).
client = QdrantClient(":memory:")


# Embedding model shared by indexing and querying; all-MiniLM-L6-v2 produces
# 384-dimensional sentence vectors and is small enough to load at import time.
encoder = SentenceTransformer('all-MiniLM-L6-v2')




# Single collection that holds the chunks of the most recently indexed PDFs.
COLLECTION_NAME = "pdf_documents"
|
|
|
|
|
def create_collection_if_not_exists():
    """Ensure the Qdrant collection exists, creating it on first use.

    Probes the collection with ``get_collection`` (EAFP); any failure is
    taken to mean "missing" and triggers creation with a cosine-distance
    vector config sized to the encoder's embedding dimension.
    """
    try:
        client.get_collection(collection_name=COLLECTION_NAME)
        return  # Already present — nothing to do.
    except Exception:
        pass  # Probe failed: assume the collection does not exist yet.

    vector_params = models.VectorParams(
        size=encoder.get_sentence_embedding_dimension(),
        distance=models.Distance.COSINE,
    )
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=vector_params,
    )
|
|
|
|
|
create_collection_if_not_exists() |
|
|
|
|
|
def answer_pdf_question(pdf_path, question, gemini_model):
    """Answer a question about a PDF via retrieval-augmented generation.

    Extracts the PDF's text, splits it into fixed-size chunks, embeds and
    indexes the chunks into the module-level Qdrant collection, retrieves
    the chunks most similar to *question*, and asks the Gemini model to
    answer using that retrieved context.

    Args:
        pdf_path: Filesystem path to the PDF file to read.
        question: Natural-language question about the document.
        gemini_model: A ``google.generativeai`` model object exposing
            ``generate_content``.

    Returns:
        The model's answer text, or an error-description string if any
        step fails (the underlying exception is folded into the message).
    """
    try:
        # --- 1. Extract raw text from every page of the PDF. ---
        page_texts = []
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                # extract_text() can return None for pages with no
                # extractable text; guard so the join never breaks.
                page_texts.append(page.extract_text() or "")
        text = "".join(page_texts)

        # --- 2. Chunk, embed, and index into Qdrant. ---
        # Fixed 500-character windows: crude but dependency-free chunking.
        chunks = [text[i:i + 500] for i in range(0, len(text), 500)]

        if chunks:
            # Batch-encode all chunks in one call — SentenceTransformer
            # accepts a list natively and this is far faster than
            # encoding chunks one at a time in a loop.
            vectors = encoder.encode(chunks)
            points = [
                models.PointStruct(
                    id=i,
                    vector=vector.tolist(),
                    payload={"text": chunk},
                )
                for i, (chunk, vector) in enumerate(zip(chunks, vectors))
            ]
            # NOTE(review): point ids restart at 0 on every call, so a
            # previously indexed, longer PDF leaves stale high-id chunks
            # in the shared collection — consider clearing the collection
            # per document if cross-call contamination matters.
            client.upsert(
                collection_name=COLLECTION_NAME,
                points=points,
                wait=True,
            )

        # --- 3. Retrieve the chunks most similar to the question. ---
        query_vector = encoder.encode(question).tolist()
        search_result = client.search(
            collection_name=COLLECTION_NAME,
            query_vector=query_vector,
            limit=3,
        )
        context = " ".join(hit.payload['text'] for hit in search_result)

        # --- 4. Ask Gemini to answer from the retrieved context. ---
        response = gemini_model.generate_content(
            f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"
        )
        return response.text

    except Exception as e:
        # Boundary handler: this function is the public entry point, so
        # fold any failure into a readable message instead of raising.
        return f"Error processing PDF or answering question: {e}"