"""Gradio app: chat with uploaded PDF documents via a RAG pipeline.

Pipeline: extract text from PDFs -> split into overlapping chunks -> embed
into a FAISS vector store -> answer questions with a
ConversationalRetrievalChain backed by a HuggingFace Hub LLM.
"""

import gradio as gr
from PyPDF2 import PdfReader
import docx  # NOTE(review): unused here — presumably for future .docx support
import os  # NOTE(review): unused here; env vars are handled via load_dotenv
from dotenv import load_dotenv
from langchain.text_splitter import CharacterTextSplitter
# Use the langchain_community paths (the pre-0.2 `langchain.embeddings` /
# `langchain.vectorstores` paths are deprecated/removed; this file already
# depends on langchain_community for HuggingFaceHub).
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_community.llms import HuggingFaceHub

# Module-level state shared across Gradio callbacks.
conversation = None  # ConversationalRetrievalChain, set once files are processed
chat_history = []    # most recent chat history returned by the chain


def get_pdf_text(pdf_docs):
    """Extract text from a list of uploaded PDF files.

    Unreadable files are skipped (error printed to stdout); pages that yield
    no text (e.g. scanned images) are ignored.

    Args:
        pdf_docs: iterable of file paths / file-like objects accepted by
            ``PdfReader``.

    Returns:
        The concatenated page text, or ``None`` if nothing usable was
        extracted.
    """
    text = ""
    for pdf in pdf_docs:
        try:
            pdf_reader = PdfReader(pdf)
            for page in pdf_reader.pages:
                page_text = page.extract_text()
                if page_text:  # only add if text was actually extracted
                    text += page_text + "\n"
        except Exception as e:
            # Best-effort: a single bad file must not abort the whole batch.
            print(f"Error reading PDF: {str(e)}")
    return text if text.strip() else None


def get_text_chunks(text):
    """Split raw text into overlapping chunks suitable for embedding.

    Returns an empty list for falsy input.
    """
    if not text:
        return []
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return text_splitter.split_text(text)


def get_vectorstore(text_chunks):
    """Embed the chunks and build a FAISS vector store.

    Returns ``None`` when there are no chunks to index.
    """
    if not text_chunks:
        return None
    embeddings = HuggingFaceEmbeddings()
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)


def get_conversation_chain(vectorstore):
    """Build the retrieval chain over *vectorstore* and store it globally.

    Side effect: rebinds the module-level ``conversation`` so the Gradio
    callbacks can reach it. Also returns the chain for convenience.
    """
    global conversation
    llm = HuggingFaceHub(
        repo_id="google/flan-t5-xxl",
        model_kwargs={"temperature": 0.5, "max_length": 512},
    )
    memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    conversation = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
    )
    return conversation


def process_files(files):
    """Gradio callback: run the full ingestion pipeline on uploaded files.

    Returns a human-readable status string for the Status textbox.
    """
    global conversation, chat_history
    if not files:
        return "Please upload files first"
    try:
        # Get PDF text
        raw_text = get_pdf_text(files)
        if not raw_text:
            # BUGFIX: this literal was broken across a physical line in the
            # original file (a syntax error); rejoined here.
            return ("❌ Could not extract text from PDF(s). "
                    "The file may be scanned or corrupted.")
        # Get text chunks
        text_chunks = get_text_chunks(raw_text)
        if not text_chunks:
            return "❌ No valid text chunks could be created."
        # Create vector store
        vectorstore = get_vectorstore(text_chunks)
        if not vectorstore:
            return "❌ Failed to create vector store."
        # Create conversation chain (stored in the module global)
        get_conversation_chain(vectorstore)
        return "✅ Files processed successfully! You can now ask questions."
    except Exception as e:
        # Boundary handler: surface the error in the Status textbox.
        return f"❌ Error processing files: {str(e)}"


def ask_question(question, history):
    """Gradio callback: answer *question* and append the turn to *history*.

    Returns the (possibly extended) chatbot history; never raises.
    """
    global conversation, chat_history
    if not question:
        return history
    if not conversation:
        return history + [(question, "Please process files first")]
    try:
        # NOTE(review): __call__ on chains is deprecated in newer langchain;
        # kept for compatibility with the version this app targets.
        response = conversation({"question": question})
        answer = response["answer"]
        chat_history = response["chat_history"]
        return history + [(question, answer)]
    except Exception as e:
        return history + [(question, f"Error: {str(e)}")]


# ---------------------------- Gradio interface ----------------------------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📄 Chat with PDFs")
    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(
                label="Upload PDFs",
                file_types=[".pdf"],
                file_count="multiple",
            )
            process_btn = gr.Button("Process")
            status = gr.Textbox(label="Status")
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="Conversation")
            question = gr.Textbox(
                label="Your Question",
                placeholder="Ask about your documents...",
            )
            submit_btn = gr.Button("Submit")

    # Event handlers
    process_btn.click(
        process_files,
        inputs=file_input,
        outputs=status,
    )
    submit_btn.click(
        ask_question,
        inputs=[question, chatbot],
        outputs=[chatbot],
    )
    question.submit(
        ask_question,
        inputs=[question, chatbot],
        outputs=[chatbot],
    )


if __name__ == '__main__':
    load_dotenv()  # load HUGGINGFACEHUB_API_TOKEN etc. from .env before launch
    demo.launch()