Gowthamvemula committed on
Commit
009a93d
·
verified ·
1 Parent(s): a866ebf

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +32 -33
src/streamlit_app.py CHANGED
@@ -14,7 +14,7 @@ from langchain.docstore.document import Document
14
  @st.cache_resource
15
  def load_models():
16
  llm = Ollama(model="llama3")
17
- sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2')
18
  return llm, sentence_transformer
19
 
20
  llm, sentence_transformer = load_models()
@@ -55,31 +55,31 @@ def init_database():
55
  conn.commit()
56
  conn.close()
57
 
58
- # Process uploaded files and text
59
  @st.cache_resource
60
  def process_documents(_uploaded_files, manual_text=""):
61
  init_database()
62
  conn = sqlite3.connect('itc_finance.db')
63
  cursor = conn.cursor()
64
-
65
  text_splitter = RecursiveCharacterTextSplitter(
66
  chunk_size=1000,
67
  chunk_overlap=200
68
  )
69
-
70
  chroma_db = Chroma(
71
  embedding_function=sentence_transformer_embedding,
72
  persist_directory="./chroma_db"
73
  )
74
-
75
  documents = []
76
-
77
  # Process uploaded files
78
  for uploaded_file in _uploaded_files:
79
  file_path = f"./temp_{uploaded_file.name}"
80
  with open(file_path, "wb") as f:
81
  f.write(uploaded_file.getbuffer())
82
-
83
  if uploaded_file.name.endswith('.pdf'):
84
  loader = PyPDFLoader(file_path)
85
  pages = loader.load_and_split()
@@ -87,7 +87,7 @@ def process_documents(_uploaded_files, manual_text=""):
87
  with open(file_path, 'r') as f:
88
  text = f.read()
89
  pages = [Document(page_content=text)]
90
-
91
  for page in pages:
92
  chunks = text_splitter.split_text(page.page_content)
93
  for chunk in chunks:
@@ -96,20 +96,19 @@ def process_documents(_uploaded_files, manual_text=""):
96
  (uploaded_file.name, chunk)
97
  )
98
  doc_id = cursor.lastrowid
99
-
100
  chroma_db.add_texts(
101
  texts=[chunk],
102
  metadatas=[{"source": uploaded_file.name, "sql_id": doc_id}]
103
  )
104
-
105
  cursor.execute(
106
  "UPDATE documents SET embedding_id = ? WHERE id = ?",
107
  (str(doc_id), doc_id)
108
- )
109
-
110
  os.remove(file_path)
111
  documents.append(uploaded_file.name)
112
-
113
  # Process manual text
114
  if manual_text:
115
  chunks = text_splitter.split_text(manual_text)
@@ -119,18 +118,18 @@ def process_documents(_uploaded_files, manual_text=""):
119
  ("Manual Input", chunk)
120
  )
121
  doc_id = cursor.lastrowid
122
-
123
  chroma_db.add_texts(
124
  texts=[chunk],
125
  metadatas=[{"source": "Manual Input", "sql_id": doc_id}]
126
  )
127
-
128
  cursor.execute(
129
  "UPDATE documents SET embedding_id = ? WHERE id = ?",
130
  (str(doc_id), doc_id)
131
  )
132
  documents.append("Manual Input")
133
-
134
  conn.commit()
135
  conn.close()
136
  return chroma_db, documents
@@ -143,20 +142,20 @@ def get_query_engine(chroma_db):
143
  Use only the provided context to answer.
144
  Cite sources like: [Source: {source}, page X]
145
  <</SYS>>
146
-
147
  Context: {context}
148
-
149
  Question: {question}[/INST]
150
  """)
151
-
152
  def format_docs(docs):
153
  return "\n\n".join(
154
  f"Document Excerpt: {doc.page_content}\nSource: {doc.metadata['source']}"
155
  for doc in docs
156
  )
157
-
158
  retriever = chroma_db.as_retriever(search_kwargs={"k": 3})
159
-
160
  return (
161
  {
162
  "context": retriever | format_docs,
@@ -171,21 +170,21 @@ def get_query_engine(chroma_db):
171
  if uploaded_files or manual_text:
172
  with st.spinner("Processing documents..."):
173
  chroma_db, processed_docs = process_documents(uploaded_files, manual_text)
174
-
175
  st.success(f"Processed {len(processed_docs)} documents")
176
  query_engine = get_query_engine(chroma_db)
177
-
178
  # Query interface
179
  st.divider()
180
  question = st.text_input("Ask about ITC's finances:", placeholder="E.g. What was the revenue growth in 2023?")
181
-
182
  if question:
183
  with st.spinner("Analyzing..."):
184
  answer = query_engine.invoke({"question": question})
185
-
186
  st.subheader("Analysis Result")
187
  st.markdown(answer)
188
-
189
  with st.expander("View source documents"):
190
  st.write(chroma_db.similarity_search(question))
191
  else:
@@ -195,17 +194,17 @@ else:
195
  with st.sidebar:
196
  st.markdown("## How to Use")
197
  st.markdown("""
198
- 1. Upload PDF reports/presentations
199
- 2. Or paste financial text
200
  3. Ask questions about the data
201
  """)
202
-
203
  st.markdown("## Sample Questions")
204
  st.markdown("""
205
- - What was ITC's net profit in 2023?
206
- - Compare revenue between 20222024
207
  - Show me key financial ratios
208
  """)
209
-
210
  st.markdown("## System Info")
211
- st.code("Using: Llama 3 (local)\nEmbeddings: all-MiniLM-L6-v2")
 
14
  @st.cache_resource
15
  def load_models():
16
  llm = Ollama(model="llama3")
17
+ sentence_transformer = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
18
  return llm, sentence_transformer
19
 
20
  llm, sentence_transformer = load_models()
 
55
  conn.commit()
56
  conn.close()
57
 
58
+ # Process uploaded files
59
  @st.cache_resource
60
  def process_documents(_uploaded_files, manual_text=""):
61
  init_database()
62
  conn = sqlite3.connect('itc_finance.db')
63
  cursor = conn.cursor()
64
+
65
  text_splitter = RecursiveCharacterTextSplitter(
66
  chunk_size=1000,
67
  chunk_overlap=200
68
  )
69
+
70
  chroma_db = Chroma(
71
  embedding_function=sentence_transformer_embedding,
72
  persist_directory="./chroma_db"
73
  )
74
+
75
  documents = []
76
+
77
  # Process uploaded files
78
  for uploaded_file in _uploaded_files:
79
  file_path = f"./temp_{uploaded_file.name}"
80
  with open(file_path, "wb") as f:
81
  f.write(uploaded_file.getbuffer())
82
+
83
  if uploaded_file.name.endswith('.pdf'):
84
  loader = PyPDFLoader(file_path)
85
  pages = loader.load_and_split()
 
87
  with open(file_path, 'r') as f:
88
  text = f.read()
89
  pages = [Document(page_content=text)]
90
+
91
  for page in pages:
92
  chunks = text_splitter.split_text(page.page_content)
93
  for chunk in chunks:
 
96
  (uploaded_file.name, chunk)
97
  )
98
  doc_id = cursor.lastrowid
99
+
100
  chroma_db.add_texts(
101
  texts=[chunk],
102
  metadatas=[{"source": uploaded_file.name, "sql_id": doc_id}]
103
  )
104
+
105
  cursor.execute(
106
  "UPDATE documents SET embedding_id = ? WHERE id = ?",
107
  (str(doc_id), doc_id)
108
+
 
109
  os.remove(file_path)
110
  documents.append(uploaded_file.name)
111
+
112
  # Process manual text
113
  if manual_text:
114
  chunks = text_splitter.split_text(manual_text)
 
118
  ("Manual Input", chunk)
119
  )
120
  doc_id = cursor.lastrowid
121
+
122
  chroma_db.add_texts(
123
  texts=[chunk],
124
  metadatas=[{"source": "Manual Input", "sql_id": doc_id}]
125
  )
126
+
127
  cursor.execute(
128
  "UPDATE documents SET embedding_id = ? WHERE id = ?",
129
  (str(doc_id), doc_id)
130
  )
131
  documents.append("Manual Input")
132
+
133
  conn.commit()
134
  conn.close()
135
  return chroma_db, documents
 
142
  Use only the provided context to answer.
143
  Cite sources like: [Source: {source}, page X]
144
  <</SYS>>
145
+
146
  Context: {context}
147
+
148
  Question: {question}[/INST]
149
  """)
150
+
151
  def format_docs(docs):
152
  return "\n\n".join(
153
  f"Document Excerpt: {doc.page_content}\nSource: {doc.metadata['source']}"
154
  for doc in docs
155
  )
156
+
157
  retriever = chroma_db.as_retriever(search_kwargs={"k": 3})
158
+
159
  return (
160
  {
161
  "context": retriever | format_docs,
 
170
  if uploaded_files or manual_text:
171
  with st.spinner("Processing documents..."):
172
  chroma_db, processed_docs = process_documents(uploaded_files, manual_text)
173
+
174
  st.success(f"Processed {len(processed_docs)} documents")
175
  query_engine = get_query_engine(chroma_db)
176
+
177
  # Query interface
178
  st.divider()
179
  question = st.text_input("Ask about ITC's finances:", placeholder="E.g. What was the revenue growth in 2023?")
180
+
181
  if question:
182
  with st.spinner("Analyzing..."):
183
  answer = query_engine.invoke({"question": question})
184
+
185
  st.subheader("Analysis Result")
186
  st.markdown(answer)
187
+
188
  with st.expander("View source documents"):
189
  st.write(chroma_db.similarity_search(question))
190
  else:
 
194
  with st.sidebar:
195
  st.markdown("## How to Use")
196
  st.markdown("""
197
+ 1. Upload PDF reports/presentations
198
+ 2. Or paste financial text
199
  3. Ask questions about the data
200
  """)
201
+
202
  st.markdown("## Sample Questions")
203
  st.markdown("""
204
+ - What was ITC's net profit in 2023?
205
+ - Compare revenue between 2022-2024
206
  - Show me key financial ratios
207
  """)
208
+
209
  st.markdown("## System Info")
210
+ st.code(f"Using: Llama 3 (local)\nEmbeddings: sentence-transformers/all-MiniLM-L6-v2")