# app.py — Gradio PDF question-answering Space ("newapp", commit ad20893)
import os
import gradio as gr
import tempfile
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from transformers import AutoModel, AutoTokenizer
import torch
from typing import List
import numpy as np
class TransformersEmbeddings:
    """LangChain-compatible embedding wrapper around a Hugging Face encoder.

    Exposes the ``embed_documents`` / ``embed_query`` pair that langchain
    vector stores expect, backed by the BAAI/bge-small-en model.
    """

    def __init__(self):
        self.model_name = "BAAI/bge-small-en"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModel.from_pretrained(self.model_name)
        self.model.eval()  # inference only; disables dropout etc.

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a batch of texts; returns one vector per input text."""
        encoded = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        with torch.no_grad():
            model_out = self.model(**encoded)
        # First token ([CLS]) hidden state is used as the sentence embedding.
        cls_vectors = model_out.last_hidden_state[:, 0, :].cpu().numpy()
        return cls_vectors.tolist()

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string (same encoder as documents)."""
        return self.embed_documents([text])[0]
# Initialize components
# Module-level FAISS index; None until process_pdf() has run successfully.
vectorstore = None
# Shared embedding model — loads BAAI/bge-small-en weights at import time.
embedding_model = TransformersEmbeddings()
def process_pdf(file):
    """Build the module-level FAISS vector store from an uploaded PDF.

    Args:
        file: Raw PDF bytes (Gradio ``gr.File`` with ``type="binary"``).

    Returns:
        A human-readable status string ("✅ ..." on success, "❌ Error: ..."
        on failure) suitable for the status textbox.
    """
    global vectorstore
    tmp_path = None
    try:
        # PyPDFLoader needs a path on disk, so spill the bytes to a temp file.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp.write(file)
            tmp_path = tmp.name
        docs = PyPDFLoader(tmp_path).load()
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=800,
            chunk_overlap=150
        )
        splits = text_splitter.split_documents(docs)
        texts = [doc.page_content for doc in splits]
        metadatas = [doc.metadata for doc in splits]
        # Bug fix: FAISS.from_texts expects an Embeddings-like object (it
        # calls .embed_documents / .embed_query on it), not a bare callable;
        # the previous code passed embedding_model.embed_query and also
        # pre-computed an `embeddings` list that was never used.
        vectorstore = FAISS.from_texts(
            texts=texts,
            embedding=embedding_model,
            metadatas=metadatas
        )
        return "✅ PDF processed successfully!"
    except Exception as e:
        return f"❌ Error: {str(e)}"
    finally:
        # Always remove the temp file, even when processing fails midway
        # (the original leaked it on any exception).
        if tmp_path and os.path.exists(tmp_path):
            os.unlink(tmp_path)
def answer_question(question):
    """Retrieve the chunks most similar to `question` from the vector store.

    Args:
        question: The user's free-text question.

    Returns:
        A string with up to 2000 characters of matching context plus a
        per-chunk source-page listing, or a warning/error message.
    """
    global vectorstore
    if not vectorstore:
        return "⚠️ Please upload a PDF first"
    try:
        docs = vectorstore.similarity_search(question, k=3)
        context = "\n\n".join([doc.page_content for doc in docs])
        # Bug fix: the original did metadata.get('page', 'N/A') + 1, which
        # raised TypeError ('N/A' + 1) whenever a chunk had no page number.
        source_lines = []
        for doc in docs:
            page = doc.metadata.get("page")
            # PyPDFLoader page numbers are 0-based; show 1-based to the user.
            label = page + 1 if isinstance(page, int) else "N/A"
            source_lines.append(f"- Page {label}")
        sources = "\n📄 Sources:\n" + "\n".join(source_lines)
        answer = f"Relevant content from document:\n{context[:2000]}...\n{sources}"
        return answer
    except Exception as e:
        return f"❌ Error: {str(e)}"
# ---- Gradio UI -----------------------------------------------------------
# Two-step flow: upload & index a PDF, then ask questions against the index.
with gr.Blocks() as app:
    gr.Markdown("# 📄 PDF Question Answering System")
    with gr.Row():
        # type="binary" hands process_pdf the raw file bytes.
        file_input = gr.File(label="Upload PDF", type="binary")
        upload_btn = gr.Button("Process PDF")
    status = gr.Textbox(label="Status")
    question = gr.Textbox(label="Your Question")
    answer = gr.Textbox(label="Answer", interactive=False, lines=10)
    ask_btn = gr.Button("Get Answer")
    # Wire buttons to the handlers defined above.
    upload_btn.click(process_pdf, inputs=file_input, outputs=status)
    ask_btn.click(answer_question, inputs=question, outputs=answer)
app.launch()