# PDF Question Answering Space — Gradio app (Hugging Face Spaces)
import os
import gradio as gr
import tempfile
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from transformers import AutoModel, AutoTokenizer
import torch
from typing import List
import numpy as np
class TransformersEmbeddings:
    """Minimal LangChain-compatible embedding wrapper around a HF encoder.

    Implements the ``embed_documents`` / ``embed_query`` duck-type expected
    by LangChain vector stores, using the first ([CLS]) token's final hidden
    state as the sentence embedding.
    """

    def __init__(self, model_name: str = "BAAI/bge-small-en"):
        """Load tokenizer and encoder weights.

        Args:
            model_name: Hugging Face model id. Defaults to the previously
                hard-coded "BAAI/bge-small-en" so existing callers are
                unaffected.
        """
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModel.from_pretrained(self.model_name)
        self.model.eval()  # inference only — disables dropout etc.

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a batch of texts; returns one vector per input text."""
        if not texts:
            # Guard: the tokenizer fails on an empty batch; an empty list of
            # texts trivially has an empty list of embeddings.
            return []
        inputs = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=512,
        )
        with torch.no_grad():  # no autograd graph needed for inference
            outputs = self.model(**inputs)
        # First-token ([CLS]) hidden state as the sentence representation.
        embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        return embeddings.tolist()

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string via the batch path."""
        return self.embed_documents([text])[0]
# Initialize components
# Module-level state shared by the Gradio callbacks below:
# - `vectorstore` holds the FAISS index for the most recently processed PDF
#   (None until a PDF has been uploaded and processed).
# - `embedding_model` is loaded once at startup and reused for every request.
vectorstore = None
embedding_model = TransformersEmbeddings()
def process_pdf(file):
    """Build a FAISS index from an uploaded PDF and store it globally.

    Args:
        file: Raw PDF bytes (payload of ``gr.File(type="binary")``).

    Returns:
        A status string: a success message, or an error message prefixed
        with "❌" (exceptions are reported rather than propagated, since the
        return value feeds a Gradio textbox).
    """
    global vectorstore
    tmp_path = None
    try:
        # PyPDFLoader needs a path on disk, so spill the bytes to a temp file.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp.write(file)
            tmp_path = tmp.name
        docs = PyPDFLoader(tmp_path).load()
        if not docs:
            return "❌ Error: no readable pages found in the PDF"
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=800,
            chunk_overlap=150,
        )
        splits = text_splitter.split_documents(docs)
        texts = [doc.page_content for doc in splits]
        metadatas = [doc.metadata for doc in splits]
        # BUG FIX: FAISS.from_texts expects an embeddings *object* exposing
        # embed_documents/embed_query, not a bare callable; the old code passed
        # `embedding_model.embed_query`, which fails at runtime. (It also
        # pre-computed `embed_documents(texts)` into an unused variable,
        # embedding every chunk twice — that dead work is removed.)
        vectorstore = FAISS.from_texts(
            texts=texts,
            embedding=embedding_model,
            metadatas=metadatas,
        )
        return "✅ PDF processed successfully!"
    except Exception as e:
        return f"❌ Error: {str(e)}"
    finally:
        # Always remove the temp file — the old code leaked it on any error
        # raised between creation and the final unlink.
        if tmp_path and os.path.exists(tmp_path):
            os.unlink(tmp_path)
def answer_question(question):
    """Retrieve the chunks most similar to *question* and format them.

    Args:
        question: Free-text user question.

    Returns:
        The top-3 matching chunk texts (truncated to 2000 chars) followed by
        a source list of 1-based page numbers; a "⚠️" warning when no PDF has
        been processed yet; or a "❌" error string on failure.
    """
    global vectorstore
    if not vectorstore:
        return "⚠️ Please upload a PDF first"
    try:
        docs = vectorstore.similarity_search(question, k=3)
        context = "\n\n".join(doc.page_content for doc in docs)
        # BUG FIX: the old code did `metadata.get('page', 'N/A') + 1`, which
        # raised TypeError ('N/A' + 1) whenever a chunk had no page metadata,
        # turning a valid answer into "❌ Error: ...". Only shift 0-based page
        # indices to 1-based display when the value is actually an int.
        source_lines = []
        for doc in docs:
            page = doc.metadata.get("page")
            label = page + 1 if isinstance(page, int) else "N/A"
            source_lines.append(f"- Page {label}")
        sources = "\n📄 Sources:\n" + "\n".join(source_lines)
        return f"Relevant content from document:\n{context[:2000]}...\n{sources}"
    except Exception as e:
        return f"❌ Error: {str(e)}"
# ---- Gradio UI -----------------------------------------------------------
with gr.Blocks() as app:
    gr.Markdown("# 📄 PDF Question Answering System")

    # Upload row: PDF picker beside its processing trigger.
    with gr.Row():
        pdf_file = gr.File(label="Upload PDF", type="binary")
        process_button = gr.Button("Process PDF")

    status_box = gr.Textbox(label="Status")
    question_box = gr.Textbox(label="Your Question")
    answer_box = gr.Textbox(label="Answer", interactive=False, lines=10)
    answer_button = gr.Button("Get Answer")

    # Wire callbacks: upload -> status message, question -> answer text.
    process_button.click(process_pdf, inputs=pdf_file, outputs=status_box)
    answer_button.click(answer_question, inputs=question_box, outputs=answer_box)

app.launch()