MrSimple07 commited on
Commit
fb442dd
·
1 Parent(s): 8fefe0f

new app py + init data

Browse files
Files changed (2) hide show
  1. app.py +14 -46
  2. document_processor.py +66 -47
app.py CHANGED
@@ -6,6 +6,7 @@ from llama_index.core import Settings
6
  from config import *
7
  from document_processor import *
8
  from llama_index.core.chat_engine import CondensePlusContextChatEngine
 
9
 
10
  #new thing
11
  query_engine = None
@@ -22,7 +23,6 @@ def answer_question(question, history):
22
  try:
23
  start_time = time.time()
24
 
25
- # Initialize chat engine if not exists
26
  if chat_engine is None:
27
  chat_engine = CondensePlusContextChatEngine.from_defaults(
28
  retriever=query_engine.retriever,
@@ -45,17 +45,15 @@ def answer_question(question, history):
45
  </div>
46
  </div>"""
47
 
48
- # Update chat history (keep last 6 messages - 3 exchanges)
49
- new_history = history + [[question, response.response]]
50
- if len(new_history) > 3:
51
- new_history = new_history[-3:]
52
 
53
  return answer_with_time, sources_html, new_history
54
 
55
  except Exception as e:
56
  error_msg = f"<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>❌ Error processing question: {str(e)}</div>"
57
  return error_msg, "", history
58
-
59
 
60
 
61
  def generate_sources_html(nodes):
@@ -164,7 +162,8 @@ def create_interface():
164
  chatbot = gr.Chatbot(
165
  label="Chat History",
166
  height=400,
167
- show_label=True
 
168
  )
169
 
170
  question_input = gr.Textbox(
@@ -227,39 +226,35 @@ def create_interface():
227
 
228
  refresh_btn = gr.Button("🔄 Refresh List", variant="secondary")
229
 
230
- with gr.Column(scale=1, elem_id="upload-column"):
231
- gr.Markdown("#### Upload new documents", elem_classes=["upload-header"])
232
- gr.Markdown("Supported formats: PDF, TXT", elem_classes=["upload-info"])
233
 
234
  file_upload = gr.File(
235
  file_count="multiple",
236
  file_types=[".pdf", ".txt"],
237
- label="Select files to upload",
238
- elem_classes=["upload-file"]
239
  )
240
 
241
  doc_names_input = gr.Textbox(
242
  label="Document names (one per line)",
243
  placeholder="Enter document names, one per line...",
244
- lines=5,
245
- elem_classes=["upload-input"]
246
  )
247
 
248
  doc_links_input = gr.Textbox(
249
  label="Document links (one per line)",
250
  placeholder="Enter document links, one per line...",
251
- lines=5,
252
- elem_classes=["upload-input"]
253
  )
254
 
255
- upload_btn = gr.Button("📤 Upload and Process", variant="primary", elem_classes=["upload-btn"])
256
 
257
  upload_status = gr.Textbox(
258
  label="Upload status",
259
  lines=8,
260
  max_lines=10,
261
- interactive=False,
262
- elem_classes=["upload-status"]
263
  )
264
 
265
  def process_names_and_links(names_text, links_text):
@@ -281,35 +276,8 @@ def create_interface():
281
  outputs=[documents_display]
282
  )
283
 
284
- # Add CSS to fix white background in upload tab
285
- demo.css = """
286
- #upload-column {
287
- background-color: #f8f9fa !important;
288
- padding: 20px !important;
289
- border-radius: 10px !important;
290
- border: 1px solid #e9ecef !important;
291
- }
292
- .upload-header h4 {
293
- color: #2d3748 !important;
294
- margin-bottom: 10px !important;
295
- }
296
- .upload-info {
297
- color: #666 !important;
298
- margin-bottom: 15px !important;
299
- }
300
- .upload-file, .upload-input, .upload-status {
301
- background-color: white !important;
302
- border: 1px solid #ced4da !important;
303
- border-radius: 5px !important;
304
- }
305
- .upload-btn {
306
- margin-top: 10px !important;
307
- }
308
- """
309
-
310
  return demo
311
 
312
-
313
  if __name__ == "__main__":
314
  try:
315
  log_message("🚀 Starting AIEXP - AI Expert for Regulatory Documentation")
 
6
  from config import *
7
  from document_processor import *
8
  from llama_index.core.chat_engine import CondensePlusContextChatEngine
9
+ import faiss
10
 
11
  #new thing
12
  query_engine = None
 
23
  try:
24
  start_time = time.time()
25
 
 
26
  if chat_engine is None:
27
  chat_engine = CondensePlusContextChatEngine.from_defaults(
28
  retriever=query_engine.retriever,
 
45
  </div>
46
  </div>"""
47
 
48
+ new_history = history + [{"role": "user", "content": question}, {"role": "assistant", "content": response.response}]
49
+ if len(new_history) > 6:
50
+ new_history = new_history[-6:]
 
51
 
52
  return answer_with_time, sources_html, new_history
53
 
54
  except Exception as e:
55
  error_msg = f"<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>❌ Error processing question: {str(e)}</div>"
56
  return error_msg, "", history
 
57
 
58
 
59
  def generate_sources_html(nodes):
 
162
  chatbot = gr.Chatbot(
163
  label="Chat History",
164
  height=400,
165
+ show_label=True,
166
+ type="messages"
167
  )
168
 
169
  question_input = gr.Textbox(
 
226
 
227
  refresh_btn = gr.Button("🔄 Refresh List", variant="secondary")
228
 
229
+ with gr.Column(scale=1):
230
+ gr.Markdown("#### Upload new documents")
231
+ gr.Markdown("Supported formats: PDF, TXT")
232
 
233
  file_upload = gr.File(
234
  file_count="multiple",
235
  file_types=[".pdf", ".txt"],
236
+ label="Select files to upload"
 
237
  )
238
 
239
  doc_names_input = gr.Textbox(
240
  label="Document names (one per line)",
241
  placeholder="Enter document names, one per line...",
242
+ lines=5
 
243
  )
244
 
245
  doc_links_input = gr.Textbox(
246
  label="Document links (one per line)",
247
  placeholder="Enter document links, one per line...",
248
+ lines=5
 
249
  )
250
 
251
+ upload_btn = gr.Button("📤 Upload and Process", variant="primary")
252
 
253
  upload_status = gr.Textbox(
254
  label="Upload status",
255
  lines=8,
256
  max_lines=10,
257
+ interactive=False
 
258
  )
259
 
260
  def process_names_and_links(names_text, links_text):
 
276
  outputs=[documents_display]
277
  )
278
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279
  return demo
280
 
 
281
  if __name__ == "__main__":
282
  try:
283
  log_message("🚀 Starting AIEXP - AI Expert for Regulatory Documentation")
document_processor.py CHANGED
@@ -10,6 +10,9 @@ from llama_index.core.response_synthesizers import get_response_synthesizer, Res
10
  from llama_index.core.prompts import PromptTemplate
11
  from config import *
12
  import shutil
 
 
 
13
 
14
  def log_message(message):
15
  print(message, flush=True)
@@ -184,61 +187,77 @@ def add_to_vector_index(new_chunks, file_info, existing_chunks_df=None):
184
  return None, existing_chunks_df, str(e)
185
 
186
  def initialize_system():
 
 
187
  try:
188
- log_message("🔄 Initializing AI Expert system...")
189
  os.makedirs(download_dir, exist_ok=True)
190
 
191
- chunks_df = None
192
- try:
193
- chunks_csv_path = os.path.join(download_dir, chunks_filename)
194
- if os.path.exists(chunks_csv_path):
195
- log_message("📂 Loading existing chunks...")
196
- chunks_df = pd.read_csv(chunks_csv_path)
197
- else:
198
- log_message("📝 Creating empty chunks database...")
199
- chunks_df = pd.DataFrame(columns=['chunk_id', 'document_id', 'document_name', 'document_link', 'chunk_text'])
200
- except Exception as e:
201
- log_message(f"⚠️ Could not load chunks: {str(e)}")
202
- chunks_df = pd.DataFrame(columns=['chunk_id', 'document_id', 'document_name', 'document_link', 'chunk_text'])
203
-
204
- query_engine = None
205
- if not chunks_df.empty:
206
- log_message("🤖 Setting up AI models...")
207
- embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
208
-
209
- log_message("📝 Creating document objects from existing chunks...")
210
- documents = [Document(text=str(row['chunk_text']),
211
- metadata={
212
- "chunk_id": row.get('chunk_id', i),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  "document_id": row.get('document_id', 'unknown'),
214
  "document_name": row.get('document_name', 'unknown'),
215
- "document_link": row.get('document_link', '')
216
- })
217
- for i, (_, row) in enumerate(chunks_df.iterrows())]
218
-
219
- log_message("🔍 Building vector index...")
220
- vector_index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)
221
-
222
- retriever = VectorIndexRetriever(
223
- index=vector_index,
224
- similarity_top_k=RETRIEVER_TOP_K,
225
- similarity_cutoff=SIMILARITY_THRESHOLD
226
- )
227
-
228
- custom_prompt_template = PromptTemplate(CUSTOM_PROMPT)
229
- response_synthesizer = get_response_synthesizer(
230
- response_mode=ResponseMode.TREE_SUMMARIZE,
231
- text_qa_template=custom_prompt_template
232
- )
233
-
234
- query_engine = RetrieverQueryEngine(
235
- retriever=retriever,
236
- response_synthesizer=response_synthesizer
237
- )
238
 
239
  log_message("✅ System successfully initialized!")
240
  return query_engine, chunks_df, True
241
 
242
  except Exception as e:
243
  log_message(f"❌ Initialization error: {str(e)}")
244
- return None, None, False
 
 
10
  from llama_index.core.prompts import PromptTemplate
11
  from config import *
12
  import shutil
13
+ import faiss
14
+ from huggingface_hub import hf_hub_download
15
+
16
 
17
  def log_message(message):
18
  print(message, flush=True)
 
187
  return None, existing_chunks_df, str(e)
188
 
189
  def initialize_system():
190
+ global query_engine, chunks_df
191
+
192
  try:
193
+ log_message("🔄 Initializing system...")
194
  os.makedirs(download_dir, exist_ok=True)
195
 
196
+ log_message("📥 Loading files...")
197
+ faiss_index_path = hf_hub_download(
198
+ repo_id=REPO_ID,
199
+ filename=faiss_index_filename,
200
+ local_dir=download_dir,
201
+ repo_type="dataset",
202
+ token=HF_TOKEN
203
+ )
204
+
205
+ chunks_csv_path = hf_hub_download(
206
+ repo_id=REPO_ID,
207
+ filename=chunks_filename,
208
+ local_dir=download_dir,
209
+ repo_type="dataset",
210
+ token=HF_TOKEN
211
+ )
212
+
213
+ log_message("📚 Loading index and data...")
214
+ index_faiss = faiss.read_index(faiss_index_path)
215
+ chunks_df = pd.read_csv(chunks_csv_path)
216
+
217
+ log_message("🤖 Setting up models...")
218
+ embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
219
+
220
+ text_column = None
221
+ for col in chunks_df.columns:
222
+ if 'text' in col.lower() or 'content' in col.lower() or 'chunk' in col.lower():
223
+ text_column = col
224
+ break
225
+
226
+ if text_column is None:
227
+ text_column = chunks_df.columns[0]
228
+
229
+ log_message("📝 Creating documents...")
230
+ documents = [Document(text=str(row[text_column]),
231
+ metadata={"chunk_id": row.get('chunk_id', i),
232
  "document_id": row.get('document_id', 'unknown'),
233
  "document_name": row.get('document_name', 'unknown'),
234
+ "document_link": row.get('document_link', '')})
235
+ for i, (_, row) in enumerate(chunks_df.iterrows())]
236
+
237
+ log_message("🔍 Building vector index...")
238
+ vector_index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)
239
+
240
+ retriever = VectorIndexRetriever(
241
+ index=vector_index,
242
+ similarity_top_k=RETRIEVER_TOP_K,
243
+ similarity_cutoff=SIMILARITY_THRESHOLD
244
+ )
245
+
246
+ custom_prompt_template = PromptTemplate(CUSTOM_PROMPT)
247
+ response_synthesizer = get_response_synthesizer(
248
+ response_mode=ResponseMode.TREE_SUMMARIZE,
249
+ text_qa_template=custom_prompt_template
250
+ )
251
+
252
+ query_engine = RetrieverQueryEngine(
253
+ retriever=retriever,
254
+ response_synthesizer=response_synthesizer
255
+ )
 
256
 
257
  log_message("✅ System successfully initialized!")
258
  return query_engine, chunks_df, True
259
 
260
  except Exception as e:
261
  log_message(f"❌ Initialization error: {str(e)}")
262
+ chunks_df = pd.DataFrame(columns=['chunk_id', 'document_id', 'document_name', 'document_link', 'chunk_text'])
263
+ return None, chunks_df, False