Spaces: Sleeping
Himanshu kumar Vishwakrma committed on
Commit · 074614d
Parent(s): e46a2d5
Streamlit to the gradio
app.py
CHANGED
@@ -1,128 +1,148 @@
-
- import streamlit as st
  from PyPDF2 import PdfReader
- from langchain.text_splitter import CharacterTextSplitter
- from langchain.embeddings.huggingface import HuggingFaceEmbeddings
- from langchain.vectorstores import FAISS  # facebook AI similarity search
- from langchain.chains.question_answering import load_qa_chain
- from langchain import HuggingFaceHub
  import docx
  import os
  from langchain.chains import ConversationalRetrievalChain
  from langchain.memory import ConversationBufferMemory
- from dotenv import load_dotenv
- from streamlit_chat import message
-
-
- def main():
-     load_dotenv()
-     st.set_page_config(page_title="Ask your PDF")
-     st.header("Ask Your PDF")

-
-
-
-
-     if "processComplete" not in st.session_state:
-         st.session_state.processComplete = None

-
-
-
-
-     # pdf = st.file_uploader("Upload your pdf", type="pdf")
-
-     if process:
-         files_text = get_files_text(uploaded_files)
-         # get text chunks
-         text_chunks = get_text_chunks(files_text)
-         # create vetore stores
-         vetorestore = get_vectorstore(text_chunks)
-         # create conversation chain
-         st.session_state.conversation = get_conversation_chain(vetorestore)  # for openAI
-         # st.session_state.conversation = get_conversation_chain(vetorestore)  # for huggingface
-
-         st.session_state.processComplete = True
-
-     if st.session_state.processComplete == True:
-         user_question = st.chat_input("Ask Question about your files.")
-         if user_question:
-             handel_userinput(user_question)
-
- def get_files_text(uploaded_files):
      text = ""
-     for uploaded_file in uploaded_files:
-         split_tup = os.path.splitext(uploaded_file.name)
-         file_extension = split_tup[1]
-         if file_extension == ".pdf":
-             text += get_pdf_text(uploaded_file)
-         elif file_extension == ".docx":
-             text += get_docx_text(uploaded_file)
-         else:
-             text += get_csv_text(uploaded_file)
      return text

- def get_pdf_text(pdf_file):
-     text = ""
-     pdf_reader = PdfReader(pdf_file)
-     for page in pdf_reader.pages:
-         text += page.extract_text()
-     return text

- def get_docx_text(file):
-
-
-     for
-
-
      return text

- def get_csv_text(file):
-     return "a"
-
  def get_text_chunks(text):
-
      text_splitter = CharacterTextSplitter(
          separator="\n",
-         chunk_size=
-         chunk_overlap=
          length_function=len
      )
-     chunks = text_splitter.split_text(text)
-     return chunks
-

  def get_vectorstore(text_chunks):
      embeddings = HuggingFaceEmbeddings()
-     knowledge_base = FAISS.from_texts(text_chunks, embeddings)
-     return knowledge_base

- def get_conversation_chain(vetorestore):
-
-     llm = HuggingFaceHub(
-
-
          llm=llm,
-         retriever=vetorestore.as_retriever(),
-         memory=memory
-         callbacks=[handler]
      )
-     return conversation_chain
-

- def handel_userinput(user_question):
-
-
-
-
-
-
-
-
-
-
-

- if __name__ == "__main__":
-     main()

+ import gradio as gr
  from PyPDF2 import PdfReader
  import docx
  import os
+ from dotenv import load_dotenv
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain.vectorstores import FAISS
  from langchain.chains import ConversationalRetrievalChain
  from langchain.memory import ConversationBufferMemory
+ from langchain_community.llms import HuggingFaceHub

+ # Initialize global variables
+ conversation = None
+ chat_history = []
+ process_complete = False

+ def get_pdf_text(pdf_file):
+     """Extract text from PDF"""
+     reader = PdfReader(pdf_file)
      text = ""
+     for page in reader.pages:
+         text += page.extract_text() or ""
      return text

+ def get_docx_text(docx_file):
+     """Extract text from DOCX"""
+     doc = docx.Document(docx_file)
+     return "\n".join([para.text for para in doc.paragraphs])

+ def get_files_text(files):
+     """Process multiple files"""
+     text = ""
+     for file in files:
+         if file.name.endswith(".pdf"):
+             text += get_pdf_text(file)
+         elif file.name.endswith(".docx"):
+             text += get_docx_text(file)
      return text

  def get_text_chunks(text):
+     """Split text into chunks"""
      text_splitter = CharacterTextSplitter(
          separator="\n",
+         chunk_size=1000,
+         chunk_overlap=200,
          length_function=len
      )
+     return text_splitter.split_text(text)

  def get_vectorstore(text_chunks):
+     """Create vector store from text"""
      embeddings = HuggingFaceEmbeddings()
+     return FAISS.from_texts(text_chunks, embeddings)

+ def get_conversation_chain(vectorstore):
+     """Initialize conversation chain"""
+     llm = HuggingFaceHub(
+         repo_id="google/flan-t5-large",
+         model_kwargs={"temperature": 0.5, "max_length": 512}
+     )
+     memory = ConversationBufferMemory(
+         memory_key='chat_history',
+         return_messages=True
+     )
+     return ConversationalRetrievalChain.from_llm(
          llm=llm,
+         retriever=vectorstore.as_retriever(),
+         memory=memory
      )

+ def process_files(files):
+     """Handle file processing"""
+     global conversation, process_complete
+     if not files:
+         return "Please upload files first"
+
+     try:
+         raw_text = get_files_text(files)
+         text_chunks = get_text_chunks(raw_text)
+         vectorstore = get_vectorstore(text_chunks)
+         conversation = get_conversation_chain(vectorstore)
+         process_complete = True
+         return "✅ Files processed successfully! You can now ask questions."
+     except Exception as e:
+         return f"❌ Error: {str(e)}"
+
+ def ask_question(question, history):
+     """Handle question answering"""
+     global conversation, chat_history
+     if not process_complete:
+         return history + [(question, "Please process files first")]
+
+     if not question:
+         return history
+
+     try:
+         response = conversation({"question": question})
+         answer = response["answer"]
+         chat_history = response["chat_history"]
+         return history + [(question, answer)]
+     except Exception as e:
+         return history + [(question, f"Error: {str(e)}")]
+
+ # Gradio Interface
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
+     gr.Markdown("# 📄 PDF/DOCX Chatbot")
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             file_input = gr.File(
+                 label="Upload Files",
+                 file_types=[".pdf", ".docx"],
+                 file_count="multiple"
+             )
+             process_btn = gr.Button("Process Files")
+             status = gr.Textbox(label="Status")
+
+         with gr.Column(scale=2):
+             chatbot = gr.Chatbot(label="Conversation")
+             question = gr.Textbox(
+                 label="Your Question",
+                 placeholder="Ask about your documents..."
+             )
+             submit_btn = gr.Button("Submit")
+
+     # Event handlers
+     process_btn.click(
+         process_files,
+         inputs=file_input,
+         outputs=status
+     )
+
+     submit_btn.click(
+         ask_question,
+         inputs=[question, chatbot],
+         outputs=[chatbot]
+     )
+
+     question.submit(
+         ask_question,
+         inputs=[question, chatbot],
+         outputs=[chatbot]
+     )

+ if __name__ == "__main__":
+     load_dotenv()
+     demo.launch()
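Note on running the new Gradio version outside the Space: LangChain's HuggingFaceHub wrapper needs a Hugging Face API token in the environment, and the load_dotenv() call before demo.launch() lets a local .env file supply it. A minimal local-run sketch follows; the package list, the token variable, and the local_run.py filename are assumptions inferred from app.py's imports, not part of this commit.

# local_run.py: minimal sketch, assumes dependencies installed via
#   pip install gradio PyPDF2 python-docx python-dotenv langchain langchain-community faiss-cpu sentence-transformers
# and a .env next to app.py containing HUGGINGFACEHUB_API_TOKEN=hf_xxx (placeholder value).
import os
from dotenv import load_dotenv

load_dotenv()  # same mechanism app.py uses under __main__
assert os.getenv("HUGGINGFACEHUB_API_TOKEN"), "HuggingFaceHub needs an API token"

from app import demo  # importing app.py builds the gr.Blocks UI at module level
demo.launch()         # Gradio serves on http://127.0.0.1:7860 by default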