# Hugging Face Spaces: himanshukumar378/pdfchatbot (status: Sleeping)
# File size: 4,737 Bytes

import gradio as gr
from PyPDF2 import PdfReader
import docx
import os
from dotenv import load_dotenv
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_community.llms import HuggingFaceHub

# Module-level chat state.
# `chat_history` starts as an empty list; `conversation` is None until
# get_conversation_chain() assigns the retrieval chain to it.
chat_history = []
conversation = None

def get_pdf_text(pdf_docs):
    """Extract and concatenate the text of every page in the given PDFs.

    Each page that yields text contributes that text followed by a newline.
    If reading a PDF raises, the error is printed and the loop moves on to
    the next PDF; page text gathered before the failure is kept.

    Returns the combined text, or None when the result is empty or
    whitespace only.
    """
    collected = []
    for document in pdf_docs:
        try:
            for page in PdfReader(document).pages:
                content = page.extract_text()
                # Pages without extractable text are skipped.
                if content:
                    collected.append(content + "\n")
        except Exception as exc:
            print(f"Error reading PDF: {exc!s}")

    combined = "".join(collected)
    if not combined.strip():
        return None
    return combined

def get_text_chunks(text):
    """Break *text* into overlapping chunks.

    Falsy input (None or an empty string) yields an empty list. Otherwise
    the text is passed to a CharacterTextSplitter configured with a newline
    separator, chunk_size=1000 and chunk_overlap=200, with len() as the
    length function.
    """
    if text:
        splitter_options = {
            "separator": "\n",
            "chunk_size": 1000,
            "chunk_overlap": 200,
            "length_function": len,
        }
        return CharacterTextSplitter(**splitter_options).split_text(text)
    return []

def get_vectorstore(text_chunks):
    """Build a FAISS store from *text_chunks* using HuggingFaceEmbeddings.

    Returns None when there are no chunks to index.
    """
    if text_chunks:
        embedder = HuggingFaceEmbeddings()
        return FAISS.from_texts(texts=text_chunks, embedding=embedder)
    return None

def get_conversation_chain(vectorstore):
    """Build the retrieval chat chain and store it in the global `conversation`.

    The chain combines a HuggingFaceHub LLM ("google/flan-t5-xxl"), a
    ConversationBufferMemory keyed on 'chat_history', and the retriever
    obtained from *vectorstore*. The new chain is also returned.
    """
    global conversation

    generation_settings = {"temperature": 0.5, "max_length": 512}
    llm = HuggingFaceHub(
        repo_id="google/flan-t5-xxl",
        model_kwargs=generation_settings,
    )
    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

    conversation = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
    )
    return conversation

def process_files(files):
    """Run the upload pipeline and return a status message for the UI.

    Steps, in order: extract text from the PDFs, split it into chunks,
    build the vector store, then create the conversation chain. The first
    step that produces nothing ends the run with its own message; any
    exception raised along the way is reported as an error message instead
    of propagating.
    """
    global conversation, chat_history

    if not files:
        return "Please upload files first"

    try:
        extracted = get_pdf_text(files)
        if not extracted:
            return "❌ Could not extract text from PDF(s). The file may be scanned or corrupted."

        pieces = get_text_chunks(extracted)
        if not pieces:
            return "❌ No valid text chunks could be created."

        index = get_vectorstore(pieces)
        if not index:
            return "❌ Failed to create vector store."

        # Assigns the module-level `conversation` as a side effect.
        get_conversation_chain(index)
        return "✅ Files processed successfully! You can now ask questions."
    except Exception as err:
        return f"❌ Error processing files: {err!s}"

def ask_question(question, history):
    """Answer *question* with the active chain and extend the chat history.

    Args:
        question: Text entered by the user. None, empty, or whitespace-only
            input is treated as "no question".
        history: List of (question, answer) pairs shown so far. None is
            treated as an empty list.

    Returns:
        The history unchanged when there is no question; otherwise a new
        list equal to the history plus one (question, reply) pair. The
        reply is the chain's answer, a prompt to process files first when
        no chain exists yet, or an "Error: ..." message if the chain raised.

    Side effects:
        On a successful call the module-level `chat_history` is replaced
        with the "chat_history" value returned by the chain.
    """
    global conversation, chat_history

    # Normalise a missing history so the list concatenations below cannot
    # fail with a TypeError.
    history = history or []

    # Blank input is not a question; do not send it to the model.
    if not question or not question.strip():
        return history

    if not conversation:
        return history + [(question, "Please process files first")]

    try:
        response = conversation({"question": question})
        answer = response["answer"]
        chat_history = response["chat_history"]
        return history + [(question, answer)]
    except Exception as e:
        # Show the failure in the chat instead of letting it propagate.
        return history + [(question, f"Error: {str(e)}")]

# Gradio UI: the first column holds the upload/processing controls, the
# second column holds the chat.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📄 Chat with PDFs")

    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(label="Upload PDFs", file_types=[".pdf"], file_count="multiple")
            process_btn = gr.Button("Process")
            status = gr.Textbox(label="Status")

        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="Conversation")
            question = gr.Textbox(label="Your Question", placeholder="Ask about your documents...")
            submit_btn = gr.Button("Submit")

    # "Process" runs process_files on the uploads and shows its status message.
    process_btn.click(fn=process_files, inputs=file_input, outputs=status)

    # Both the "Submit" button and submitting the question textbox call
    # ask_question with the same inputs and outputs.
    submit_btn.click(fn=ask_question, inputs=[question, chatbot], outputs=[chatbot])
    question.submit(fn=ask_question, inputs=[question, chatbot], outputs=[chatbot])

if __name__ == "__main__":
    # Only when run as a script: call load_dotenv() (presumably to pick up
    # settings from a .env file), then launch the Gradio app.
    load_dotenv()
    demo.launch()