Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from PyPDF2 import PdfReader | |
| import docx | |
| import os | |
| from dotenv import load_dotenv | |
| from langchain.text_splitter import CharacterTextSplitter | |
| from langchain.embeddings import HuggingFaceEmbeddings | |
| from langchain.vectorstores import FAISS | |
| from langchain.chains import ConversationalRetrievalChain | |
| from langchain.memory import ConversationBufferMemory | |
| from langchain_community.llms import HuggingFaceHub | |
# Module-level state shared by the Gradio callbacks below.
# `chat_history` keeps the message history returned by the latest chain call.
chat_history = []
# `conversation` is the active retrieval chain; it stays None until files
# have been processed.
conversation = None
def get_pdf_text(pdf_docs):
    """Extract the text of every page of every PDF in ``pdf_docs``.

    Each entry is passed straight to ``PdfReader``. Pages whose
    ``extract_text()`` result is empty are skipped; every kept page is
    followed by a newline. A PDF that raises while being read is reported on
    stdout and skipped, keeping any pages already extracted from it.

    Args:
        pdf_docs: Iterable of PDF sources to hand to ``PdfReader``.
            ``None`` is treated as an empty collection.

    Returns:
        The concatenated page text, or ``None`` when no non-whitespace text
        could be extracted.
    """
    pages = []
    # `or ()` lets a missing upload (None) behave like an empty list instead
    # of raising TypeError when iterated.
    for pdf in pdf_docs or ():
        try:
            pdf_reader = PdfReader(pdf)
            for page in pdf_reader.pages:
                page_text = page.extract_text()
                if page_text:  # Only keep pages that yielded text
                    pages.append(page_text + "\n")
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole batch.
            print(f"Error reading PDF: {str(e)}")
    # Join once at the end rather than growing a string page by page.
    text = "".join(pages)
    return text if text.strip() else None
def get_text_chunks(text):
    """Cut ``text`` into overlapping pieces for embedding.

    The text is split on newlines by a ``CharacterTextSplitter`` configured
    for chunks of 1000 with an overlap of 200, both measured with ``len``.

    Args:
        text: The text to split; may be empty or ``None``.

    Returns:
        The list produced by the splitter, or ``[]`` when ``text`` is falsy.
    """
    if text:
        splitter = CharacterTextSplitter(
            separator="\n",
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
        )
        return splitter.split_text(text)
    return []
def get_vectorstore(text_chunks):
    """Index ``text_chunks`` in a FAISS vector store.

    Embeddings come from a ``HuggingFaceEmbeddings`` instance created with
    its default arguments.

    Args:
        text_chunks: The text pieces to index; may be empty or ``None``.

    Returns:
        The store built by ``FAISS.from_texts``, or ``None`` when
        ``text_chunks`` is falsy.
    """
    if text_chunks:
        embedder = HuggingFaceEmbeddings()
        return FAISS.from_texts(texts=text_chunks, embedding=embedder)
    return None
def get_conversation_chain(vectorstore):
    """Build the retrieval chat chain and publish it as the active one.

    The chain combines a ``HuggingFaceHub`` LLM (``google/flan-t5-xxl``), a
    ``ConversationBufferMemory`` stored under the ``chat_history`` key, and
    the retriever obtained from ``vectorstore``.

    Args:
        vectorstore: Object exposing ``as_retriever()``.

    Returns:
        The new chain, which is also stored in the module-level
        ``conversation`` variable.
    """
    global conversation
    hub_llm = HuggingFaceHub(
        repo_id="google/flan-t5-xxl",
        model_kwargs={"temperature": 0.5, "max_length": 512},
    )
    buffer_memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    doc_retriever = vectorstore.as_retriever()
    conversation = ConversationalRetrievalChain.from_llm(
        llm=hub_llm,
        retriever=doc_retriever,
        memory=buffer_memory,
    )
    return conversation
def process_files(files):
    """Turn the uploaded PDFs into a ready-to-query conversation chain.

    Runs the pipeline text extraction -> chunking -> vector store ->
    conversation chain, stopping at the first stage that yields nothing.

    Args:
        files: The uploaded files, forwarded to ``get_pdf_text``. A falsy
            value (``None`` or an empty list) means nothing was uploaded.

    Returns:
        A status message for the UI. Failures are prefixed with a cross mark
        and success with a check mark so the two are distinguishable at a
        glance. Exceptions are never propagated; they are reported in the
        returned message instead.
    """
    global conversation, chat_history
    if not files:
        return "Please upload files first"
    try:
        # Get PDF text
        raw_text = get_pdf_text(files)
        if not raw_text:
            return "❌ Could not extract text from PDF(s). The file may be scanned or corrupted."
        # Get text chunks
        text_chunks = get_text_chunks(raw_text)
        if not text_chunks:
            return "❌ No valid text chunks could be created."
        # Create vector store
        vectorstore = get_vectorstore(text_chunks)
        if not vectorstore:
            return "❌ Failed to create vector store."
        # Create conversation chain
        conversation = get_conversation_chain(vectorstore)
        # The new chain starts with empty memory, so drop the history that
        # belonged to the previous set of documents.
        chat_history = []
        return "✅ Files processed successfully! You can now ask questions."
    except Exception as e:
        # UI boundary: report the failure in the status box instead of
        # letting the callback crash.
        return f"❌ Error processing files: {str(e)}"
def ask_question(question, history):
    """Answer ``question`` with the active chain and extend the chat log.

    Args:
        question: The user's question. ``None``, empty, and whitespace-only
            values are ignored.
        history: The chatbot's current list of (question, answer) pairs.
            ``None`` is treated as an empty list.

    Returns:
        The chat log to display: unchanged when there is no question,
        otherwise extended with one pair holding the answer, a prompt to
        process files first, or an error message.
    """
    global conversation, chat_history
    # Normalise inputs so a missing history or a blank question cannot crash
    # the callback or trigger a pointless LLM call.
    history = history or []
    question = (question or "").strip()
    if not question:
        return history
    if not conversation:
        return history + [(question, "Please process files first")]
    try:
        response = conversation({"question": question})
        answer = response["answer"]
        chat_history = response["chat_history"]
        return history + [(question, answer)]
    except Exception as e:
        # UI boundary: show the failure in the chat instead of raising.
        return history + [(question, f"Error: {str(e)}")]
# Gradio Interface
# Left column: PDF upload, the "Process" button and a status box.
# Right column: the chat log, the question box and the "Submit" button.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📚 Chat with PDFs")
    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(
                label="Upload PDFs",
                file_types=[".pdf"],
                file_count="multiple",
            )
            process_btn = gr.Button("Process")
            status = gr.Textbox(label="Status")
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="Conversation")
            question = gr.Textbox(
                label="Your Question",
                placeholder="Ask about your documents...",
            )
            submit_btn = gr.Button("Submit")

    # Event handlers
    process_btn.click(
        process_files,
        inputs=file_input,
        outputs=status,
    )
    # The Submit button and the textbox's submit event run the same handler
    # with the same wiring, so register both in one place.
    for register in (submit_btn.click, question.submit):
        register(
            ask_question,
            inputs=[question, chatbot],
            outputs=[chatbot],
        )
# Script entry point: read variables from a .env file into the environment,
# then start the Gradio app.
if __name__ == "__main__":
    load_dotenv()
    demo.launch()