# Hugging Face Space: Gradio app for asking questions about an uploaded PDF.
import os

import gradio as gr
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import chromadb

# Sentence-embedding model, shared by indexing (process_pdf) and search (query_pdf).
model = SentenceTransformer("all-MiniLM-L6-v2")

# On-disk Chroma store so embeddings persist between app restarts.
CHROMA_PATH = "./chroma_store"
client = chromadb.PersistentClient(path=CHROMA_PATH)

# Set by process_pdf() once a PDF has been indexed; None until then.
collection = None
def process_pdf(pdf_file):
    """Load an uploaded PDF, chunk it, embed the chunks, and index them in Chroma.

    Replaces any previously indexed PDF (the old collection is dropped).

    Args:
        pdf_file: Gradio file object for the uploaded PDF, or None/falsy if
            nothing was uploaded.

    Returns:
        A human-readable status string describing success or the failure reason.
    """
    global collection
    try:
        if not pdf_file:
            return "β Please upload a valid PDF file."
        pdf_path = pdf_file.name  # Gradio exposes the temp-file path via .name
        # Load the PDF into page-level documents.
        loader = PyPDFLoader(pdf_path)
        documents = loader.load()
        if not documents:
            return "β PDF loaded but no content found."
        # Split into overlapping chunks sized for retrieval.
        splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=80)
        chunks = splitter.split_documents(documents)
        texts = [doc.page_content for doc in chunks if doc.page_content.strip()]
        if not texts:
            return "β Failed to extract valid text from PDF."
        # Embed every chunk in one batch.
        embeddings = model.encode(texts)
        # Reset the collection so each upload starts from a clean index.
        collection_name = "pdf-documents"
        try:
            client.delete_collection(collection_name)
        except Exception:
            # Collection may simply not exist yet; deletion failure is non-fatal.
            # (Was a bare `except:`, which also swallowed SystemExit/KeyboardInterrupt.)
            pass
        collection = client.create_collection(collection_name)
        collection.add(
            documents=texts,
            embeddings=embeddings.tolist(),
            metadatas=[{"source": "pdf"} for _ in texts],
            ids=[f"chunk-{i}" for i in range(len(texts))],
        )
        return f"β Successfully processed {len(texts)} chunks."
    except Exception as e:
        # Surface the error message in the UI rather than crashing the app.
        return f"β Error: {str(e)}"
def query_pdf(query):
    """Return the 3 indexed PDF chunks most similar to *query*.

    Args:
        query: Free-text question from the user.

    Returns:
        The matching chunk texts joined by a separator, or a status string if
        no PDF has been indexed, nothing matched, or the query failed.
    """
    # Fix: compare against the None sentinel explicitly instead of relying on
    # the truthiness of a third-party collection object.
    if collection is None:
        return "β No PDF has been processed yet."
    try:
        query_embedding = model.encode([query])
        results = collection.query(
            query_embeddings=query_embedding.tolist(),
            n_results=3,
        )
        # Chroma returns one result list per query embedding; we sent one query.
        docs = results.get("documents", [[]])[0]
        if not docs:
            return "β No matching results found."
        return "\n\n---\n\n".join(docs)
    except Exception as e:
        # Surface the error message in the UI rather than crashing the app.
        return f"β Query error: {str(e)}"
# --- Gradio UI ---
with gr.Blocks() as demo:
    gr.Markdown("# π Ask Questions About Your PDF")

    # Upload row: file picker plus a button that triggers indexing.
    with gr.Row():
        pdf_input = gr.File(label="π Upload PDF", file_types=[".pdf"])
        upload_button = gr.Button("π€ Process PDF")
    status_output = gr.Textbox(label="Status")
    upload_button.click(fn=process_pdf, inputs=pdf_input, outputs=status_output)

    # Question/answer pair: submitting the textbox runs a similarity search.
    question_input = gr.Textbox(label="β Ask a question")
    answer_output = gr.Textbox(label="π¬ Answer")
    question_input.submit(fn=query_pdf, inputs=question_input, outputs=answer_output)

demo.launch()