Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from PyPDF2 import PdfReader | |
| # LangChain components | |
| from langchain.text_splitter import CharacterTextSplitter | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
| from langchain_core.prompts import PromptTemplate | |
| # Hugging Face Transformers | |
| from transformers import pipeline | |
| # ---------------- Load LLM ---------------- | |
def load_llm():
    """Create the text2text-generation pipeline used to answer questions.

    Returns:
        The ``transformers`` pipeline built for ``google/flan-t5-base``, or
        ``None`` when anything goes wrong while building it. The failure is
        printed rather than raised so that the rest of the app can still start.
    """
    # An instruction-following model; a low temperature is requested to keep
    # answers focused.
    generation_defaults = {
        "max_length": 512,
        "temperature": 0.1,
    }
    try:
        generator = pipeline(
            "text2text-generation",
            model="google/flan-t5-base",
            **generation_defaults,
        )
        print("✅ Successfully loaded model: google/flan-t5-base")
        return generator
    except Exception as e:
        print(f"⚠️ Failed to load model: {e}")
        return None


# Loaded once at import time; ``None`` signals that the model is unavailable.
llm = load_llm()
| # ---------------- Process PDF ---------------- | |
def process_pdf(pdf_files):
    """Build a FAISS vector store from the text of the given PDF files.

    Args:
        pdf_files: Iterable of PDF inputs, each passed as-is to ``PdfReader``.
            ``None`` or an empty iterable is treated as "no files".

    Returns:
        A ``FAISS`` store indexing the extracted text in chunks, or ``None``
        when no text could be extracted from any page.
    """
    # Collect per-page text first and join once instead of growing a string
    # with ``+=`` inside the loop.
    page_texts = []
    for pdf in pdf_files or []:
        reader = PdfReader(pdf)
        for page in reader.pages:
            extracted = page.extract_text()
            if extracted:
                page_texts.append(extracted + "\n")

    text = "".join(page_texts)
    if not text.strip():
        return None

    # CharacterTextSplitter only splits on its separator, which defaults to a
    # blank line ("\n\n"). The text above is assembled with single newlines,
    # so split on "\n"; otherwise chunks can greatly exceed ``chunk_size``.
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=800,
        chunk_overlap=100,
    )
    texts = splitter.split_text(text)
    if not texts:
        # Nothing to index; building a store from an empty list would fail.
        return None

    # NOTE: the embedding model is instantiated on every call.
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    return FAISS.from_texts(texts, embeddings)
| # ---------------- Ask Questions ---------------- | |
def _build_prompt(context, question):
    """Return the instruction prompt combining retrieved context and question."""
    return (
        "Based on the following information, answer the question clearly and concisely.\n"
        "Information:\n"
        f"{context}\n"
        f"Question: {question}\n"
        "Answer:"
    )


def _strip_answer_prefix(response):
    """Trim whitespace and remove one leading "Answer:" label, if present."""
    prefix = "Answer:"
    response = response.strip()
    if response.startswith(prefix):
        # Slice instead of ``str.replace`` so that later occurrences of
        # "Answer:" inside the generated text are left untouched.
        response = response[len(prefix):].strip()
    return response


def ask_question(pdf_files, question):
    """Answer ``question`` using the content of the uploaded PDF files.

    The PDFs are indexed with ``process_pdf``, the four most relevant chunks
    are retrieved for the question, and the module-level ``llm`` pipeline
    generates the answer from them.

    Args:
        pdf_files: The uploaded PDF files; forwarded to ``process_pdf``.
        question: The user's question as text.

    Returns:
        The generated answer, or a user-facing message (prefixed with a
        warning sign) when input is missing, the model is unavailable, no text
        could be extracted, or an error occurred. Exceptions are never
        propagated to the caller.
    """
    try:
        if not pdf_files:
            return "⚠️ Please upload at least one PDF file."
        # Reject empty or whitespace-only questions before doing any
        # expensive indexing or generation work.
        if not question or not question.strip():
            return "⚠️ Please enter a question."
        if llm is None:
            return "⚠️ Language model failed to load. Please try again later."

        db = process_pdf(pdf_files)
        if db is None:
            return "⚠️ No text found in the uploaded PDF(s)."

        question = question.strip()
        retriever = db.as_retriever(search_kwargs={"k": 4})
        # ``invoke`` replaces the deprecated ``get_relevant_documents``.
        docs = retriever.invoke(question)

        # Merge the retrieved chunks and collapse all runs of whitespace.
        context = " ".join(
            "\n".join(doc.page_content for doc in docs).split()
        )

        # Greedy decoding (``do_sample=False``); ``temperature`` is not
        # passed because it only applies when sampling.
        result = llm(
            _build_prompt(context, question),
            max_length=300,
            num_return_sequences=1,
            do_sample=False,
        )
        response = _strip_answer_prefix(result[0]["generated_text"])

        # Fall back only when the model produced nothing; short answers such
        # as a name, a year or "Yes" are legitimate and must be returned.
        if not response:
            return "I couldn't find a clear answer to your question in the provided documents. Please try rephrasing your question or check if the relevant information is in the uploaded PDFs."
        return response
    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing the
        # request handler.
        return f"⚠️ Error: {str(e)}"
| # ---------------- Gradio UI ---------------- | |
# Page layout: a PDF upload column next to a question column, the answer box
# underneath, then clickable example questions.
with gr.Blocks() as demo:
    gr.Markdown("## 📚 PDF Question Answering System")
    gr.Markdown("Upload PDF files and ask questions about their content.")

    with gr.Row():
        with gr.Column():
            pdf_input = gr.File(
                file_count="multiple",
                file_types=[".pdf"],
                label="Upload PDF Files",
            )
        with gr.Column():
            question_input = gr.Textbox(
                lines=2,
                label="Your Question",
                placeholder="What would you like to know about the document?",
            )
            submit_btn = gr.Button("Ask Question", variant="primary")

    with gr.Row():
        output = gr.Textbox(label="Answer", lines=4, interactive=False)

    # Each example is a one-element row because only the question box is
    # filled in.
    gr.Examples(
        examples=[
            [sample]
            for sample in (
                "What is the main topic of this document?",
                "Can you summarize the key points?",
                "What are the main findings or conclusions?",
                "Who are the authors and what are their credentials?",
            )
        ],
        inputs=question_input,
        label="Example Questions",
    )

    # Clicking the button and pressing Enter in the question box both run the
    # same handler with the same inputs and output.
    for register in (submit_btn.click, question_input.submit):
        register(ask_question, inputs=[pdf_input, question_input], outputs=output)

demo.launch()