Gowthamvemula committed on
Commit
f3eb4c3
·
verified ·
1 Parent(s): 856b1ea

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +33 -33
src/streamlit_app.py CHANGED
@@ -55,31 +55,31 @@ def init_database():
55
  conn.commit()
56
  conn.close()
57
 
58
- # Process uploaded files
59
  @st.cache_resource
60
  def process_documents(_uploaded_files, manual_text=""):
61
  init_database()
62
  conn = sqlite3.connect('itc_finance.db')
63
  cursor = conn.cursor()
64
-
65
  text_splitter = RecursiveCharacterTextSplitter(
66
  chunk_size=1000,
67
  chunk_overlap=200
68
  )
69
-
70
  chroma_db = Chroma(
71
  embedding_function=sentence_transformer_embedding,
72
  persist_directory="./chroma_db"
73
  )
74
-
75
  documents = []
76
-
77
  # Process uploaded files
78
  for uploaded_file in _uploaded_files:
79
  file_path = f"./temp_{uploaded_file.name}"
80
  with open(file_path, "wb") as f:
81
  f.write(uploaded_file.getbuffer())
82
-
83
  if uploaded_file.name.endswith('.pdf'):
84
  loader = PyPDFLoader(file_path)
85
  pages = loader.load_and_split()
@@ -87,7 +87,7 @@ def process_documents(_uploaded_files, manual_text=""):
87
  with open(file_path, 'r') as f:
88
  text = f.read()
89
  pages = [Document(page_content=text)]
90
-
91
  for page in pages:
92
  chunks = text_splitter.split_text(page.page_content)
93
  for chunk in chunks:
@@ -96,20 +96,20 @@ def process_documents(_uploaded_files, manual_text=""):
96
  (uploaded_file.name, chunk)
97
  )
98
  doc_id = cursor.lastrowid
99
-
100
  chroma_db.add_texts(
101
  texts=[chunk],
102
  metadatas=[{"source": uploaded_file.name, "sql_id": doc_id}]
103
  )
104
-
105
  cursor.execute(
106
  "UPDATE documents SET embedding_id = ? WHERE id = ?",
107
- (str(doc_id), doc_id
108
  )
109
-
110
  os.remove(file_path)
111
  documents.append(uploaded_file.name)
112
-
113
  # Process manual text
114
  if manual_text:
115
  chunks = text_splitter.split_text(manual_text)
@@ -119,18 +119,18 @@ def process_documents(_uploaded_files, manual_text=""):
119
  ("Manual Input", chunk)
120
  )
121
  doc_id = cursor.lastrowid
122
-
123
  chroma_db.add_texts(
124
  texts=[chunk],
125
  metadatas=[{"source": "Manual Input", "sql_id": doc_id}]
126
  )
127
-
128
  cursor.execute(
129
  "UPDATE documents SET embedding_id = ? WHERE id = ?",
130
- (str(doc_id), doc_id
131
  )
132
  documents.append("Manual Input")
133
-
134
  conn.commit()
135
  conn.close()
136
  return chroma_db, documents
@@ -143,20 +143,20 @@ def get_query_engine(chroma_db):
143
  Use only the provided context to answer.
144
  Cite sources like: [Source: {source}, page X]
145
  <</SYS>>
146
-
147
  Context: {context}
148
-
149
  Question: {question}[/INST]
150
  """)
151
-
152
  def format_docs(docs):
153
  return "\n\n".join(
154
  f"Document Excerpt: {doc.page_content}\nSource: {doc.metadata['source']}"
155
  for doc in docs
156
  )
157
-
158
  retriever = chroma_db.as_retriever(search_kwargs={"k": 3})
159
-
160
  return (
161
  {
162
  "context": retriever | format_docs,
@@ -171,21 +171,21 @@ def get_query_engine(chroma_db):
171
  if uploaded_files or manual_text:
172
  with st.spinner("Processing documents..."):
173
  chroma_db, processed_docs = process_documents(uploaded_files, manual_text)
174
-
175
  st.success(f"Processed {len(processed_docs)} documents")
176
  query_engine = get_query_engine(chroma_db)
177
-
178
  # Query interface
179
  st.divider()
180
  question = st.text_input("Ask about ITC's finances:", placeholder="E.g. What was the revenue growth in 2023?")
181
-
182
  if question:
183
  with st.spinner("Analyzing..."):
184
  answer = query_engine.invoke({"question": question})
185
-
186
  st.subheader("Analysis Result")
187
  st.markdown(answer)
188
-
189
  with st.expander("View source documents"):
190
  st.write(chroma_db.similarity_search(question))
191
  else:
@@ -195,17 +195,17 @@ else:
195
  with st.sidebar:
196
  st.markdown("## How to Use")
197
  st.markdown("""
198
- 1. Upload PDF reports/presentations
199
- 2. Or paste financial text
200
  3. Ask questions about the data
201
  """)
202
-
203
  st.markdown("## Sample Questions")
204
  st.markdown("""
205
- - What was ITC's net profit in 2023?
206
- - Compare revenue between 2022-2024
207
  - Show me key financial ratios
208
  """)
209
-
210
  st.markdown("## System Info")
211
- st.code(f"Using: Llama 3 (local)\nEmbeddings: all-MiniLM-L6-v2")
 
55
  conn.commit()
56
  conn.close()
57
 
58
+ # Process uploaded files and text
59
  @st.cache_resource
60
  def process_documents(_uploaded_files, manual_text=""):
61
  init_database()
62
  conn = sqlite3.connect('itc_finance.db')
63
  cursor = conn.cursor()
64
+
65
  text_splitter = RecursiveCharacterTextSplitter(
66
  chunk_size=1000,
67
  chunk_overlap=200
68
  )
69
+
70
  chroma_db = Chroma(
71
  embedding_function=sentence_transformer_embedding,
72
  persist_directory="./chroma_db"
73
  )
74
+
75
  documents = []
76
+
77
  # Process uploaded files
78
  for uploaded_file in _uploaded_files:
79
  file_path = f"./temp_{uploaded_file.name}"
80
  with open(file_path, "wb") as f:
81
  f.write(uploaded_file.getbuffer())
82
+
83
  if uploaded_file.name.endswith('.pdf'):
84
  loader = PyPDFLoader(file_path)
85
  pages = loader.load_and_split()
 
87
  with open(file_path, 'r') as f:
88
  text = f.read()
89
  pages = [Document(page_content=text)]
90
+
91
  for page in pages:
92
  chunks = text_splitter.split_text(page.page_content)
93
  for chunk in chunks:
 
96
  (uploaded_file.name, chunk)
97
  )
98
  doc_id = cursor.lastrowid
99
+
100
  chroma_db.add_texts(
101
  texts=[chunk],
102
  metadatas=[{"source": uploaded_file.name, "sql_id": doc_id}]
103
  )
104
+
105
  cursor.execute(
106
  "UPDATE documents SET embedding_id = ? WHERE id = ?",
107
+ (str(doc_id), doc_id)
108
  )
109
+
110
  os.remove(file_path)
111
  documents.append(uploaded_file.name)
112
+
113
  # Process manual text
114
  if manual_text:
115
  chunks = text_splitter.split_text(manual_text)
 
119
  ("Manual Input", chunk)
120
  )
121
  doc_id = cursor.lastrowid
122
+
123
  chroma_db.add_texts(
124
  texts=[chunk],
125
  metadatas=[{"source": "Manual Input", "sql_id": doc_id}]
126
  )
127
+
128
  cursor.execute(
129
  "UPDATE documents SET embedding_id = ? WHERE id = ?",
130
+ (str(doc_id), doc_id)
131
  )
132
  documents.append("Manual Input")
133
+
134
  conn.commit()
135
  conn.close()
136
  return chroma_db, documents
 
143
  Use only the provided context to answer.
144
  Cite sources like: [Source: {source}, page X]
145
  <</SYS>>
146
+
147
  Context: {context}
148
+
149
  Question: {question}[/INST]
150
  """)
151
+
152
  def format_docs(docs):
153
  return "\n\n".join(
154
  f"Document Excerpt: {doc.page_content}\nSource: {doc.metadata['source']}"
155
  for doc in docs
156
  )
157
+
158
  retriever = chroma_db.as_retriever(search_kwargs={"k": 3})
159
+
160
  return (
161
  {
162
  "context": retriever | format_docs,
 
171
  if uploaded_files or manual_text:
172
  with st.spinner("Processing documents..."):
173
  chroma_db, processed_docs = process_documents(uploaded_files, manual_text)
174
+
175
  st.success(f"Processed {len(processed_docs)} documents")
176
  query_engine = get_query_engine(chroma_db)
177
+
178
  # Query interface
179
  st.divider()
180
  question = st.text_input("Ask about ITC's finances:", placeholder="E.g. What was the revenue growth in 2023?")
181
+
182
  if question:
183
  with st.spinner("Analyzing..."):
184
  answer = query_engine.invoke({"question": question})
185
+
186
  st.subheader("Analysis Result")
187
  st.markdown(answer)
188
+
189
  with st.expander("View source documents"):
190
  st.write(chroma_db.similarity_search(question))
191
  else:
 
195
  with st.sidebar:
196
  st.markdown("## How to Use")
197
  st.markdown("""
198
+ 1. Upload PDF reports/presentations
199
+ 2. Or paste financial text
200
  3. Ask questions about the data
201
  """)
202
+
203
  st.markdown("## Sample Questions")
204
  st.markdown("""
205
+ - What was ITC's net profit in 2023?
206
+ - Compare revenue between 2022-2024
207
  - Show me key financial ratios
208
  """)
209
+
210
  st.markdown("## System Info")
211
+ st.code("Using: Llama 3 (local)\nEmbeddings: all-MiniLM-L6-v2")