Himanshu kumar Vishwakrma committed on
Commit
074614d
·
1 Parent(s): e46a2d5

Migrate the app from Streamlit to Gradio

Browse files
Files changed (1) hide show
  1. app.py +122 -102
app.py CHANGED
@@ -1,128 +1,148 @@
1
- from dotenv import load_dotenv
2
- import streamlit as st
3
  from PyPDF2 import PdfReader
4
- from langchain.text_splitter import CharacterTextSplitter
5
- from langchain.embeddings.huggingface import HuggingFaceEmbeddings
6
- from langchain.vectorstores import FAISS #facebook AI similarity search
7
- from langchain.chains.question_answering import load_qa_chain
8
- from langchain import HuggingFaceHub
9
  import docx
10
  import os
 
 
 
 
11
  from langchain.chains import ConversationalRetrievalChain
12
  from langchain.memory import ConversationBufferMemory
13
- from langchain_core.callbacks import StdOutCallbackHandler
14
- from streamlit_chat import message
15
-
16
-
17
- def main():
18
- load_dotenv()
19
- st.set_page_config(page_title="Ask your PDF")
20
- st.header("Ask Your PDF")
21
 
22
- if "conversation" not in st.session_state:
23
- st.session_state.conversation = None
24
- if "chat_history" not in st.session_state:
25
- st.session_state.chat_history = None
26
- if "processComplete" not in st.session_state:
27
- st.session_state.processComplete = None
28
 
29
- with st.sidebar:
30
- uploaded_files = st.file_uploader("Upload your file",type=['pdf','docx'],accept_multiple_files=True)
31
- process = st.button("Process")
32
-
33
- # pdf = st.file_uploader("Upload your pdf",type="pdf")
34
-
35
- if process:
36
- files_text = get_files_text(uploaded_files)
37
- # get text chunks
38
- text_chunks = get_text_chunks(files_text)
39
- # create vetore stores
40
- vetorestore = get_vectorstore(text_chunks)
41
- # create conversation chain
42
- st.session_state.conversation = get_conversation_chain(vetorestore) #for openAI
43
- # st.session_state.conversation = get_conversation_chain(vetorestore) #for huggingface
44
-
45
- st.session_state.processComplete = True
46
-
47
- if st.session_state.processComplete == True:
48
- user_question = st.chat_input("Ask Question about your files.")
49
- if user_question:
50
- handel_userinput(user_question)
51
-
52
def get_files_text(uploaded_files):
    """Concatenate the extracted text of every uploaded file.

    Dispatches on the filename extension: .pdf and .docx have real
    extractors; everything else falls through to the CSV stub.
    """
    text = ""
    for uploaded_file in uploaded_files:
        split_tup = os.path.splitext(uploaded_file.name)
        file_extension = split_tup[1]
        if file_extension == ".pdf":
            text += get_pdf_text(uploaded_file)
        elif file_extension == ".docx":
            text += get_docx_text(uploaded_file)
        else:
            # NOTE(review): matching is case-sensitive, so ".PDF" lands here.
            text += get_csv_text(uploaded_file)
    return text
64
 
65
def get_pdf_text(pdf):
    """Return the concatenated text of every page of a PDF file object.

    NOTE(review): PyPDF2's extract_text() can return None for image-only
    pages, which would raise TypeError on the += — confirm inputs.
    """
    pdf_reader = PdfReader(pdf)
    text = ""
    for page in pdf_reader.pages:
        text += page.extract_text()
    return text
71
 
72
def get_docx_text(file):
    """Return the space-joined paragraph text of a .docx file object."""
    doc = docx.Document(file)
    allText = []
    for docpara in doc.paragraphs:
        allText.append(docpara.text)
    text = ' '.join(allText)
    return text
79
 
80
def get_csv_text(file):
    # Placeholder: CSV extraction is not implemented; returns a dummy string
    # so the caller's concatenation still works.
    return "a"
82
-
83
  def get_text_chunks(text):
84
- # spilit ito chuncks
85
  text_splitter = CharacterTextSplitter(
86
  separator="\n",
87
- chunk_size=900,
88
- chunk_overlap=100,
89
  length_function=len
90
  )
91
- chunks = text_splitter.split_text(text)
92
- return chunks
93
-
94
 
95
def get_vectorstore(text_chunks):
    """Embed the chunks and index them in an in-memory FAISS store."""
    embeddings = HuggingFaceEmbeddings()
    knowledge_base = FAISS.from_texts(text_chunks,embeddings)
    return knowledge_base
99
 
100
def get_conversation_chain(vetorestore):
    """Build a ConversationalRetrievalChain over the vector store.

    NOTE(review): temperature=5 is far outside the usual 0..1 range for
    these models — likely meant 0.5; confirm before relying on output.
    """
    handler = StdOutCallbackHandler()
    llm = HuggingFaceHub(repo_id="google/flan-t5-large", model_kwargs={"temperature":5,"max_length":64})
    # Memory keyed as 'chat_history' so the chain returns the transcript.
    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vetorestore.as_retriever(),
        memory=memory,
        callbacks=[handler]
    )
    return conversation_chain
111
-
112
 
113
def handel_userinput(user_question):
    """Send *user_question* to the stored chain and render the transcript."""
    response = st.session_state.conversation({'question':user_question})
    st.session_state.chat_history = response['chat_history']

    # Layout of input/response containers
    response_container = st.container()

    with response_container:
        # The chain alternates user/assistant turns: even indices are the
        # user's messages, odd indices are the bot's.
        for i, messages in enumerate(st.session_state.chat_history):
            if i % 2 == 0:
                message(messages.content, is_user=True, key=str(i))
            else:
                message(messages.content, key=str(i))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
 
127
# Script entry point.
if __name__ == '__main__':
    main()
 
 
1
+ import gradio as gr
 
2
  from PyPDF2 import PdfReader
 
 
 
 
 
3
  import docx
4
  import os
5
+ from dotenv import load_dotenv
6
+ from langchain.text_splitter import CharacterTextSplitter
7
+ from langchain.embeddings import HuggingFaceEmbeddings
8
+ from langchain.vectorstores import FAISS
9
  from langchain.chains import ConversationalRetrievalChain
10
  from langchain.memory import ConversationBufferMemory
11
+ from langchain_community.llms import HuggingFaceHub
 
 
 
 
 
 
 
12
 
13
# Module-level state shared between the Gradio callbacks below.
conversation = None       # ConversationalRetrievalChain, set by process_files()
chat_history = []         # last chat history returned by the chain
process_complete = False  # True once process_files() has succeeded
 
 
17
 
18
def get_pdf_text(pdf_file):
    """Return the concatenated text of every page in *pdf_file*."""
    reader = PdfReader(pdf_file)
    # extract_text() may return None for image-only pages; treat as empty.
    return "".join(page.extract_text() or "" for page in reader.pages)
25
 
26
def get_docx_text(docx_file):
    """Return the newline-joined paragraph text of *docx_file*."""
    parts = []
    for paragraph in docx.Document(docx_file).paragraphs:
        parts.append(paragraph.text)
    return "\n".join(parts)
 
 
30
 
31
def get_files_text(files):
    """Concatenate the text extracted from the uploaded files.

    Supports .pdf and .docx, matched case-insensitively (the original
    `endswith` check silently skipped uppercase extensions like "X.PDF");
    any other extension is skipped.
    """
    text = ""
    for file in files:
        ext = os.path.splitext(file.name)[1].lower()
        if ext == ".pdf":
            text += get_pdf_text(file)
        elif ext == ".docx":
            text += get_docx_text(file)
    return text
40
 
 
 
 
41
def get_text_chunks(text):
    """Break *text* into overlapping chunks suitable for embedding."""
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    chunks = splitter.split_text(text)
    return chunks
 
 
50
 
51
def get_vectorstore(text_chunks):
    """Embed *text_chunks* and index them in an in-memory FAISS store."""
    return FAISS.from_texts(text_chunks, HuggingFaceEmbeddings())
 
55
 
56
def get_conversation_chain(vectorstore):
    """Build a retrieval-augmented chat chain over *vectorstore*.

    Uses flan-t5-large via the HuggingFace Hub and a buffer memory so the
    chain can return the running transcript under 'chat_history'.
    """
    llm = HuggingFaceHub(
        repo_id="google/flan-t5-large",
        model_kwargs={"temperature": 0.5, "max_length": 512},
    )
    memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
    )
    return chain
 
 
71
 
72
def process_files(files):
    """Build the retrieval chain from the uploaded *files*.

    Returns a status string for the UI. On success, sets the module-level
    `conversation` chain, resets `chat_history` (so a reprocess starts a
    fresh transcript), and flips `process_complete`.
    """
    global conversation, chat_history, process_complete
    if not files:
        return "Please upload files first"

    try:
        raw_text = get_files_text(files)
        # Guard: FAISS.from_texts fails confusingly on an empty chunk list,
        # so report unreadable uploads explicitly instead.
        if not raw_text.strip():
            return "❌ Error: no readable text found in the uploaded files"
        text_chunks = get_text_chunks(raw_text)
        vectorstore = get_vectorstore(text_chunks)
        conversation = get_conversation_chain(vectorstore)
        chat_history = []
        process_complete = True
        return "✅ Files processed successfully! You can now ask questions."
    except Exception as e:
        return f"❌ Error: {str(e)}"
87
+
88
def ask_question(question, history):
    """Answer *question* with the global chain, returning history + new turn.

    History is a list of (user, bot) tuples as used by gr.Chatbot.
    """
    global conversation, chat_history
    # Guard clauses: no index yet, or nothing typed.
    if not process_complete:
        return history + [(question, "Please process files first")]
    if not question:
        return history

    try:
        result = conversation({"question": question})
        reply = result["answer"]
        chat_history = result["chat_history"]
    except Exception as exc:
        return history + [(question, f"Error: {str(exc)}")]
    return history + [(question, reply)]
104
+
105
# Gradio Interface: left column uploads/processes files, right column chats.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📄 PDF/DOCX Chatbot")

    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(
                label="Upload Files",
                file_types=[".pdf", ".docx"],
                file_count="multiple"
            )
            process_btn = gr.Button("Process Files")
            status = gr.Textbox(label="Status")

        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="Conversation")
            question = gr.Textbox(
                label="Your Question",
                placeholder="Ask about your documents..."
            )
            submit_btn = gr.Button("Submit")

    # Event handlers
    # "Process Files" -> process_files, status text shown in the Status box.
    process_btn.click(
        process_files,
        inputs=file_input,
        outputs=status
    )

    # Button click and Enter in the textbox both route to ask_question.
    # NOTE(review): the question box is not cleared after submission.
    submit_btn.click(
        ask_question,
        inputs=[question, chatbot],
        outputs=[chatbot]
    )

    question.submit(
        ask_question,
        inputs=[question, chatbot],
        outputs=[chatbot]
    )
145
 
146
if __name__ == "__main__":
    # Load .env (e.g. HUGGINGFACEHUB_API_TOKEN) before serving.
    # NOTE(review): this does not run when the module is merely imported —
    # the token must then come from the process environment.
    load_dotenv()
    demo.launch()