himanshukumar378 commited on
Commit
55a2078
·
verified ·
1 Parent(s): 4add137

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +118 -106
src/streamlit_app.py CHANGED
@@ -1,116 +1,128 @@
 
1
  import streamlit as st
2
- from pypdf import PdfReader
3
- from docx import Document
4
- import chromadb
5
- from chromadb.utils import embedding_functions
6
- from huggingface_hub import InferenceClient
7
- import time
 
8
  import os
 
 
 
 
9
 
10
- # Initialize ChromaDB (ephemeral for HF Spaces)
11
- client = chromadb.EphemeralClient()
12
- sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
13
- model_name="all-MiniLM-L6-v2"
14
- )
15
- collection = client.get_or_create_collection(
16
- name="documents",
17
- embedding_function=sentence_transformer_ef
18
- )
19
-
20
- # Initialize HF Inference Client
21
- hf_client = InferenceClient(model="google/gemma-2b-it")
22
-
23
- def chunk_text(text: str, chunk_size: int = 1000) -> List[str]:
24
- chunks = []
25
- start = 0
26
- while start < len(text):
27
- end = min(start + chunk_size, len(text))
28
- if end < len(text):
29
- while end > start and text[end] not in {'.', '!', '?', '\n'}:
30
- end -= 1
31
- if end == start:
32
- end = start + chunk_size
33
- chunks.append(text[start:end].strip())
34
- start = end
35
- return chunks
36
 
37
- def process_document(uploaded_file):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  text = ""
39
- if uploaded_file.type == "application/pdf":
40
- reader = PdfReader(uploaded_file)
41
- text = "\n".join([page.extract_text() for page in reader.pages])
42
- elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
43
- doc = Document(uploaded_file)
44
- text = "\n".join([para.text for para in doc.paragraphs])
45
- elif uploaded_file.type == "text/plain":
46
- text = str(uploaded_file.read(), "utf-8")
47
-
48
- chunks = chunk_text(text)
49
- ids = [f"{uploaded_file.name}-{i}" for i in range(len(chunks))]
50
- collection.add(
51
- documents=chunks,
52
- ids=ids,
53
- metadatas=[{"source": uploaded_file.name} for _ in chunks]
54
- )
55
- return len(chunks)
56
 
57
- def retrieve_chunks(query: str, k: int = 3) -> Tuple[List[str], List[str]]:
58
- results = collection.query(
59
- query_texts=[query],
60
- n_results=k
61
- )
62
- return results['documents'][0], results['metadatas'][0]
63
-
64
- def generate_response(query: str, context: str) -> str:
65
- prompt = f"""Context: {context}\n\nQuestion: {query}\nAnswer:"""
66
- return hf_client.text_generation(
67
- prompt,
68
- max_new_tokens=512,
69
- temperature=0.7
 
 
 
 
 
 
 
 
 
 
 
 
70
  )
 
 
 
71
 
72
- # Streamlit UI
73
- st.title("📄 Document Q&A Assistant")
 
 
74
 
75
- with st.sidebar:
76
- st.header("Upload Documents")
77
- uploaded_files = st.file_uploader(
78
- "Choose files",
79
- type=["pdf", "docx", "txt"],
80
- accept_multiple_files=True
 
 
 
81
  )
82
-
83
- if uploaded_files:
84
- with st.spinner("Processing documents..."):
85
- for file in uploaded_files:
86
- chunks = process_document(file)
87
- st.success(f"Processed {file.name} into {chunks} chunks")
88
-
89
- if "messages" not in st.session_state:
90
- st.session_state.messages = []
91
-
92
- for message in st.session_state.messages:
93
- with st.chat_message(message["role"]):
94
- st.markdown(message["content"])
95
-
96
- if prompt := st.chat_input("Ask about your documents"):
97
- st.session_state.messages.append({"role": "user", "content": prompt})
98
-
99
- with st.chat_message("user"):
100
- st.markdown(prompt)
101
-
102
- with st.chat_message("assistant"):
103
- with st.spinner("Searching documents..."):
104
- chunks, metadata = retrieve_chunks(prompt)
105
- context = "\n\n".join(chunks)
106
-
107
- with st.spinner("Generating response..."):
108
- response = generate_response(prompt, context)
109
- sources = list(set([m['source'] for m in metadata]))
110
-
111
- if sources:
112
- response += f"\n\nSources: {', '.join(sources)}"
113
-
114
- st.markdown(response)
115
-
116
- st.session_state.messages.append({"role": "assistant", "content": response})
 
1
+ from dotenv import load_dotenv
2
  import streamlit as st
3
+ from PyPDF2 import PdfReader
4
+ from langchain.text_splitter import CharacterTextSplitter
5
+ from langchain.embeddings.huggingface import HuggingFaceEmbeddings
6
+ from langchain.vectorstores import FAISS #facebook AI similarity search
7
+ from langchain.chains.question_answering import load_qa_chain
8
+ from langchain import HuggingFaceHub
9
+ import docx
10
  import os
11
+ from langchain.chains import ConversationalRetrievalChain
12
+ from langchain.memory import ConversationBufferMemory
13
+ from langchain_core.callbacks import StdOutCallbackHandler
14
+ from streamlit_chat import message
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
def main():
    """Entry point: render the Streamlit UI for document upload and Q&A."""
    load_dotenv()  # pull API tokens etc. from a local .env file
    st.set_page_config(page_title="Ask your PDF")
    st.header("Ask Your PDF")

    # Initialise per-session state slots on first run.
    for key in ("conversation", "chat_history", "processComplete"):
        if key not in st.session_state:
            st.session_state[key] = None

    with st.sidebar:
        uploaded_files = st.file_uploader(
            "Upload your file", type=['pdf', 'docx'], accept_multiple_files=True
        )
        process = st.button("Process")

    if process:
        files_text = get_files_text(uploaded_files)
        # split raw text into overlapping chunks
        text_chunks = get_text_chunks(files_text)
        # embed the chunks into a FAISS vector store
        vetorestore = get_vectorstore(text_chunks)
        # wire up the retrieval-augmented conversation chain
        st.session_state.conversation = get_conversation_chain(vetorestore)
        st.session_state.processComplete = True

    if st.session_state.processComplete:
        user_question = st.chat_input("Ask Question about your files.")
        if user_question:
            handel_userinput(user_question)
51
+
52
def get_files_text(uploaded_files):
    """Concatenate the extracted text of every uploaded file.

    Dispatches on file extension: .pdf and .docx get dedicated extractors;
    any other extension falls through to the CSV handler.
    """
    combined = ""
    for uploaded in uploaded_files:
        _, extension = os.path.splitext(uploaded.name)
        if extension == ".pdf":
            combined += get_pdf_text(uploaded)
        elif extension == ".docx":
            combined += get_docx_text(uploaded)
        else:
            combined += get_csv_text(uploaded)
    return combined
 
 
 
 
 
 
 
64
 
65
def get_pdf_text(pdf):
    """Return the concatenated text of every page of *pdf* (a PDF file object)."""
    # join at C speed instead of repeated string concatenation
    return "".join(page.extract_text() for page in PdfReader(pdf).pages)
71
+
72
def get_docx_text(file):
    """Return all paragraph text of a .docx *file*, joined with single spaces."""
    document = docx.Document(file)
    return ' '.join(para.text for para in document.paragraphs)
79
+
80
def get_csv_text(file):
    """Extract the textual content of an uploaded CSV file.

    The previous implementation was a placeholder that returned the literal
    string "a", so CSV uploads contributed no real text to the index. This
    version reads the file, decodes it, and joins cells with ", " and rows
    with newlines.

    Parameters: *file* — a file-like object (Streamlit uploads yield bytes
    from .read(); plain text file objects yield str).
    Returns: the CSV content as a single string.
    """
    import csv
    import io

    raw = file.read()
    # Streamlit's UploadedFile returns bytes; decode defensively.
    text = raw.decode("utf-8", errors="replace") if isinstance(raw, bytes) else raw
    rows = csv.reader(io.StringIO(text))
    return "\n".join(", ".join(row) for row in rows)
82
+
83
def get_text_chunks(text):
    """Split *text* into ~900-character chunks with 100 characters of overlap,
    breaking preferentially on newlines."""
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=900,
        chunk_overlap=100,
        length_function=len,
    )
    return splitter.split_text(text)
93
+
94
 
95
def get_vectorstore(text_chunks):
    """Embed *text_chunks* with HuggingFace embeddings and index them in FAISS."""
    embedder = HuggingFaceEmbeddings()
    return FAISS.from_texts(text_chunks, embedder)
99
 
100
def get_conversation_chain(vetorestore):
    """Build a ConversationalRetrievalChain over the given FAISS store.

    Uses google/flan-t5-large hosted on the HuggingFace Hub as the LLM and
    keeps the dialogue in a buffer memory under the 'chat_history' key,
    which handel_userinput() later reads from the chain's response.
    """
    handler = StdOutCallbackHandler()  # echoes chain steps to stdout for debugging
    # NOTE(review): temperature=5 is far outside the usual 0-1 sampling range
    # for HF text-generation endpoints — confirm the intended value.
    llm = HuggingFaceHub(repo_id="google/flan-t5-large", model_kwargs={"temperature":5,"max_length":64})
    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vetorestore.as_retriever(),
        memory=memory,
        callbacks=[handler]
    )
    return conversation_chain
111
+
112
+
113
def handel_userinput(user_question):
    """Send *user_question* through the conversation chain and render the chat.

    Relies on st.session_state.conversation having been created by main()
    after document processing. The chain's response carries the full
    'chat_history', which is re-rendered in alternating user/bot bubbles.
    """
    response = st.session_state.conversation({'question':user_question})
    st.session_state.chat_history = response['chat_history']

    # Layout of input/response containers
    response_container = st.container()

    with response_container:
        for i, messages in enumerate(st.session_state.chat_history):
            # even indices are the user's turns, odd indices the model's replies
            if i % 2 == 0:
                message(messages.content, is_user=True, key=str(i))
            else:
                message(messages.content, key=str(i))
126
+
127
# Run the app only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()