vthamaraikannan1@gmail.com committed on
Commit
646b9b3
·
1 Parent(s): eab9192

Enhance streamlit_app.py with improved document context handling and UI updates; add .gitignore for environment and build files

Browse files
Files changed (2) hide show
  1. .gitignore +37 -0
  2. src/streamlit_app.py +141 -118
.gitignore ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ignore Python bytecode caches and local database files
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ *.pyd
6
+ *.db
7
+ *.sqlite3
8
+
9
+ # Ignore virtual environment and IDE/editor folders
10
+ venv/
11
+ env/
12
+ ENV/
13
+ .venv/
14
+ .idea/
15
+ .vscode/
16
+
17
+ # Ignore OS generated files
18
+ .DS_Store
19
+ Thumbs.db
20
+
21
+ # Ignore logs and temp files
22
+ *.log
23
+ *.tmp
24
+
25
+ # Ignore coverage reports and packaging/build artifacts
26
+ .coverage
27
+ htmlcov/
28
+ *.egg-info/
29
+ dist/
30
+ build/
31
+
32
+ # Ignore node modules if present
33
+ node_modules/
34
+
35
+ # Ignore Docker files themselves
36
+ Dockerfile
37
+ .dockerignore
src/streamlit_app.py CHANGED
@@ -16,10 +16,12 @@ nltk.download("punkt_tab", quiet=True)
16
 
17
 
18
 
 
19
  PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
20
  GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
21
  HF_TOKEN = os.environ.get("HF_TOKEN")
22
 
 
23
  # -------------------------------
24
  # Page Configuration
25
  # -------------------------------
@@ -214,7 +216,7 @@ def initialize_pinecone():
214
 
215
  @st.cache_resource(show_spinner=False)
216
  def initialize_bm25():
217
- with open("src/bm25_model.pkl", "rb") as f:
218
  bm25 = pickle.load(f)
219
  return bm25
220
 
@@ -273,20 +275,47 @@ def generate_ai_response(query, relevant_docs):
273
 
274
  # Prepare context from relevant documents
275
  context_parts = []
 
276
  for i, doc in enumerate(relevant_docs, 1):
277
  metadata = doc["metadata"]
278
  text = metadata.get("text")
279
-
280
-
281
- context_parts.append(f"""{text}""")
282
-
283
- context = "\n".join(context_parts)
 
 
 
 
 
 
 
 
 
 
 
 
284
 
285
  # Create the prompt for Groq
286
  prompt = f"""
 
 
 
 
 
 
 
 
 
287
  CONTEXT DOCUMENTS:
288
  {context}
 
289
  USER QUESTION: {query}
 
 
 
 
290
  """
291
 
292
  try:
@@ -295,7 +324,7 @@ def generate_ai_response(query, relevant_docs):
295
  messages=[
296
  {
297
  "role": "system",
298
- "content": """You are a professional assistant that answers user questions based **only on the content of provided document excerpts**. The user will ask a question, and you will also receive related text chunks retrieved from company documents or PDFs.
299
 
300
  Instructions:
301
  1. Use **only** the retrieved chunks to answer the user’s question. Do **not** add information from memory or outside sources.
@@ -327,9 +356,9 @@ def generate_ai_response(query, relevant_docs):
327
  # -------------------------------
328
  st.markdown("""
329
  <div class="main-header">
330
- <h1 style="margin: 0; font-size: 1.9rem;">🔍 AI Document Search & Chat</h1>
331
  <p style="margin: 0.5rem 0 0 0; font-size: 1.1rem; opacity: 0.9;">
332
- Intelligent document retrieval with AI-powered question answering
333
  </p>
334
  </div>
335
  """, unsafe_allow_html=True)
@@ -338,57 +367,63 @@ st.markdown("""
338
  # Sidebar for filters and mode toggle
339
  # -------------------------------
340
  def clear_all_filters():
341
- st.session_state.doc_type_filter = "All Types"
 
 
 
 
342
  st.session_state.company_filter = ""
343
  st.session_state.fiscal_year_filter = ""
344
- st.session_state.page_no_filter = ""
345
- st.session_state.search_query = ""
 
 
 
 
 
 
 
 
 
 
 
 
346
 
347
  with st.sidebar:
348
- # Mode toggle
349
- st.markdown("### 🤖 Search Mode")
350
-
351
- chat_mode = st.toggle(
352
- "💬 AI Chat Mode",
353
- value=st.session_state.chat_mode,
354
- help="Enable AI chat responses based on document content"
355
- )
356
- st.session_state.chat_mode = chat_mode
357
-
358
- if chat_mode:
359
- st.success("🤖 AI Chat Mode: ON\nGet AI-generated responses based on document content")
360
- else:
361
- st.info("📋 Search Mode: Document results only")
362
-
363
- st.markdown("---")
364
  st.markdown("### 🎯 Search Filters")
365
 
366
  doc_type = st.selectbox(
367
- "📄 Document Type",
368
- ["All Types", "annual_report", "contract_report"],
369
  key="doc_type_filter"
370
  )
371
 
372
- # company = st.text_input(
373
- # "🏢 Company",
374
- # placeholder="Enter company name...",
375
- # key="company_filter"
376
- # )
377
-
378
- # fiscal_year = st.text_input(
379
- # "📅 Fiscal Year",
380
- # placeholder="e.g., 2023",
381
- # key="fiscal_year_filter"
382
- # )
383
-
384
- page_no = st.text_input(
385
- "📃 Page Number",
386
- placeholder="e.g., 15",
387
- key="page_no_filter"
388
- )
 
 
 
 
 
 
 
 
389
 
390
- # Clear filters button
391
- st.button("🗑️ Clear All Filters", on_click=clear_all_filters)
392
 
393
  # Model info
394
  st.markdown("---")
@@ -431,18 +466,49 @@ if search_clicked or (query and len(query.strip()) > 0):
431
  else:
432
  # Build filter dictionary
433
  filter_dict = {}
 
 
434
  if doc_type and doc_type != "All Types":
435
  filter_dict["doc_type"] = {"$eq": doc_type}
436
- # if company.strip():
437
- # filter_dict["company"] = {"$eq": company.strip()}
438
- # if fiscal_year.strip():
439
- # filter_dict["fiscal_year"] = {"$eq": fiscal_year.strip()}
440
- if page_no.strip():
441
  try:
442
  filter_dict["page_no"] = {"$eq": int(page_no.strip())}
443
  except ValueError:
444
  st.error("⚠️ Page number must be a valid integer.")
445
  st.stop()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
446
 
447
 
448
 
@@ -459,67 +525,35 @@ if search_clicked or (query and len(query.strip()) > 0):
459
  ai_response = generate_ai_response(query, relevant_docs)
460
 
461
  # Display AI response
462
- st.markdown(ai_response,unsafe_allow_html=True)
 
463
 
464
 
465
 
466
  st.markdown("---")
467
- # Display applied filters
468
- if filter_dict:
469
- st.markdown("### 📌 Applied Filters")
470
- filter_chips = ""
471
- for key, value in filter_dict.items():
472
- filter_value = value.get("$eq", "")
473
- filter_chips += f'<span class="metadata-chip">{key}: {filter_value}</span>'
474
- st.markdown(filter_chips, unsafe_allow_html=True)
475
 
476
  if relevant_docs:
477
  search_time = time.time() - start_time
478
 
479
- # Display search statistics
480
- st.markdown(f"""
481
- <div class="stats-container">
482
- <div style="display: flex; justify-content: space-between; align-items: center;">
483
- <div>
484
- <strong>🎯 Found {len(relevant_docs)} relevant results</strong>
485
- </div>
486
- <div>
487
- <strong>⚡ {search_time:.2f}s</strong>
488
- </div>
489
- </div>
490
- </div>
491
- """, unsafe_allow_html=True)
492
  # Display source documents
493
  if st.session_state.chat_mode:
494
- st.markdown("### 📚 Source Documents")
495
- else:
496
- st.markdown("### 📋 Search Results")
497
 
498
  for i, result in enumerate(relevant_docs, start=1):
499
  metadata = result["metadata"]
500
  text_content = metadata.get("text", "No text available")
501
- rerank_score = result["rerank_score"]
502
-
503
- # Create result card
504
- st.markdown(f"""
505
- <div >
506
- <div style="display: flex; justify-content: between; align-items: flex-start; margin-bottom: 1rem;">
507
- <h4 style="margin: 0; color: #f2f5f7; flex-grow: 1;">{"" if st.session_state.chat_mode else "Result"} #{i}</h4>
508
- </div>
509
- """, unsafe_allow_html=True)
510
-
511
- # Display metadata as chips
512
- st.markdown("#### 📊 Metadata:")
513
- metadata_chips = ""
514
- for key, value in metadata.items():
515
- if key != "text": # Don't show text in metadata chips
516
- metadata_chips += f'<span class="metadata-chip">{key}: {value}</span>'
517
 
518
- if metadata_chips:
519
- st.markdown(metadata_chips, unsafe_allow_html=True)
520
-
521
- # Display text content
522
- st.markdown(f"#### 📝 Content:")
523
  st.markdown(f'<div style="background: #303336; padding: 1rem; border-radius: 8px; margin: 1rem 0; line-height: 1.6;">{text_content}</div>', unsafe_allow_html=True)
524
 
525
 
@@ -554,24 +588,13 @@ if search_clicked or (query and len(query.strip()) > 0):
554
  if not query:
555
  st.markdown("---")
556
  st.markdown("### 💡 How to Use")
557
-
558
- col1, col2 = st.columns(2)
559
-
560
- with col1:
561
- st.markdown("""
562
- **🔍 Search Mode:**
563
- - Enter keywords to find relevant documents
564
- - Results show document excerpts and metadata
565
- - Use filters to narrow down results
566
- """)
567
-
568
- with col2:
569
- st.markdown("""
570
- **💬 AI Chat Mode:**
571
- - Ask natural language questions
572
- - Get AI-generated answers based on documents
573
- - View source documents used for the response
574
- """)
575
 
576
  # -------------------------------
577
  # Footer
 
16
 
17
 
18
 
19
+
20
  PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
21
  GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
22
  HF_TOKEN = os.environ.get("HF_TOKEN")
23
 
24
+
25
  # -------------------------------
26
  # Page Configuration
27
  # -------------------------------
 
216
 
217
  @st.cache_resource(show_spinner=False)
218
  def initialize_bm25():
219
+ with open(r"D:\rag_hugging\AI-DocumentSearch\src\bm25_model.pkl", "rb") as f:
220
  bm25 = pickle.load(f)
221
  return bm25
222
 
 
275
 
276
  # Prepare context from relevant documents
277
  context_parts = []
278
+ sources = []
279
  for i, doc in enumerate(relevant_docs, 1):
280
  metadata = doc["metadata"]
281
  text = metadata.get("text")
282
+ doc_id = metadata.get("doc_id")
283
+ title = metadata.get("title")
284
+ fiscal_year = metadata.get("fiscal_year")
285
+ page_no = metadata.get("page_no")
286
+
287
+ # Context for LLM
288
+ context_parts.append(f"[CHUNK {i} DOC {doc_id} {title} fiscal year {fiscal_year} ] (Page {page_no})\n{text}")
289
+
290
+ # Collect for UI
291
+ sources.append({
292
+ "id": i,
293
+ "title": title,
294
+ "page": page_no,
295
+ "doc_type": metadata.get("doc_type", ""),
296
+ })
297
+
298
+ context = "\n\n".join(context_parts)
299
 
300
  # Create the prompt for Groq
301
  prompt = f"""
302
+
303
+ You will answer the question using ONLY the provided document excerpts.
304
+
305
+ When you use information from a document, cite it with the format [DOC i],
306
+ where i corresponds to the document number given in CONTEXT DOCUMENTS.
307
+
308
+ If multiple docs are relevant, cite all of them (e.g., [DOC 1][DOC 3]).
309
+
310
+
311
  CONTEXT DOCUMENTS:
312
  {context}
313
+
314
  USER QUESTION: {query}
315
+
316
+ ANSWER : " "
317
+
318
+
319
  """
320
 
321
  try:
 
324
  messages=[
325
  {
326
  "role": "system",
327
+ "content": """You are a professional assistant that answers user questions based **only on the content of provided document excerpts**. The user will ask a question, and you will also receive related text chunks retrieved from company documents or PDFs.
328
 
329
  Instructions:
330
  1. Use **only** the retrieved chunks to answer the user’s question. Do **not** add information from memory or outside sources.
 
356
  # -------------------------------
357
  st.markdown("""
358
  <div class="main-header">
359
+ <h1 style="margin: 0; font-size: 1.9rem;"> Hybrid Search RAG </h1>
360
  <p style="margin: 0.5rem 0 0 0; font-size: 1.1rem; opacity: 0.9;">
361
+ Using Groq LLM, Pinecone, and Sentence Transformers
362
  </p>
363
  </div>
364
  """, unsafe_allow_html=True)
 
367
  # Sidebar for filters and mode toggle
368
  # -------------------------------
369
  def clear_all_filters():
370
+ # Common
371
+ st.session_state.search_query = ""
372
+ st.session_state.page_no_filter = ""
373
+
374
+ # Annual Report
375
  st.session_state.company_filter = ""
376
  st.session_state.fiscal_year_filter = ""
377
+ st.session_state.currency_filter = ""
378
+ st.session_state.segment_filter = ""
379
+
380
+ # Contract Report
381
+ st.session_state.agreement_date_filter = ""
382
+ st.session_state.promoter_filter = ""
383
+ st.session_state.allottee_filter = ""
384
+ st.session_state.project_name_filter = ""
385
+ st.session_state.apartment_block_filter = ""
386
+ st.session_state.apartment_floor_filter = ""
387
+ st.session_state.apartment_type_filter = ""
388
+ # st.session_state.carpet_area_filter = "" # if you add this back
389
+ st.session_state.jurisdiction_filter = ""
390
+
391
 
392
  with st.sidebar:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
393
  st.markdown("### 🎯 Search Filters")
394
 
395
  doc_type = st.selectbox(
396
+ "Document Type",
397
+ ["annual_report", "contract_report"],
398
  key="doc_type_filter"
399
  )
400
 
401
+ # Annual Report filters
402
+ if doc_type == "annual_report":
403
+ with st.expander("Annual Report Filters", expanded=False):
404
+ company = st.text_input("Company", placeholder="Enter company name...", key="company_filter")
405
+ fiscal_year = st.text_input("Fiscal Year", placeholder="e.g., 2024", key="fiscal_year_filter")
406
+ currency = st.text_input("Currency", placeholder="e.g., USD", key="currency_filter")
407
+ segment = st.text_input("Segment", placeholder="e.g., Paint Stores Group", key="segment_filter")
408
+ page_no = st.text_input("Page Number", placeholder="e.g., 15", key="page_no_filter")
409
+
410
+ # Contract Report filters
411
+ elif doc_type == "contract_report":
412
+ with st.expander("Contract Report Filters", expanded=False):
413
+ agreement_date = st.text_input("Agreement Date", placeholder="YYYY-MM-DD", key="agreement_date_filter")
414
+ promoter = st.text_input("Promoter / Developer", placeholder="Enter promoter name...", key="promoter_filter")
415
+ allottee = st.text_input("Allottee (Buyer)", placeholder="Enter allottee name...", key="allottee_filter")
416
+ project_name = st.text_input("Project Name", placeholder="Enter project name...", key="project_name_filter")
417
+ apartment_block = st.text_input("Block", placeholder="e.g., Tower A", key="apartment_block_filter")
418
+ apartment_floor = st.text_input("Floor", placeholder="e.g., 10th floor", key="apartment_floor_filter")
419
+ apartment_type = st.text_input("Apartment Type", placeholder="e.g., 2BHK", key="apartment_type_filter")
420
+ jurisdiction = st.text_input("Jurisdiction", placeholder="e.g., Madras High Court", key="jurisdiction_filter")
421
+ page_no = st.text_input("Page Number", placeholder="e.g., 15", key="page_no_filter")
422
+
423
+ # Reset button
424
+ st.button("Clear All Filters", on_click=clear_all_filters)
425
+
426
 
 
 
427
 
428
  # Model info
429
  st.markdown("---")
 
466
  else:
467
  # Build filter dictionary
468
  filter_dict = {}
469
+
470
+ # Common filters
471
  if doc_type and doc_type != "All Types":
472
  filter_dict["doc_type"] = {"$eq": doc_type}
473
+
474
+ if page_no and page_no.strip():
 
 
 
475
  try:
476
  filter_dict["page_no"] = {"$eq": int(page_no.strip())}
477
  except ValueError:
478
  st.error("⚠️ Page number must be a valid integer.")
479
  st.stop()
480
+
481
+ # Annual Report filters
482
+ if doc_type == "annual_report":
483
+ if company and company.strip():
484
+ filter_dict["company"] = {"$eq": company.strip()}
485
+ if fiscal_year and fiscal_year.strip():
486
+ filter_dict["fiscal_year"] = {"$eq": fiscal_year.strip()}
487
+ if currency and currency.strip():
488
+ filter_dict["currency"] = {"$eq": currency.strip()}
489
+ if segment and segment.strip():
490
+ filter_dict["segment"] = {"$eq": segment.strip()}
491
+
492
+ # Contract Report filters
493
+ elif doc_type == "contract_report":
494
+ if agreement_date and agreement_date.strip():
495
+ filter_dict["agreement_date"] = {"$eq": agreement_date.strip()}
496
+ if promoter and promoter.strip():
497
+ filter_dict["promoter_legal_name"] = {"$eq": promoter.strip()}
498
+ if allottee and allottee.strip():
499
+ filter_dict["allottee_name"] = {"$eq": allottee.strip()}
500
+ if project_name and project_name.strip():
501
+ filter_dict["project_name"] = {"$eq": project_name.strip()}
502
+ if apartment_block and apartment_block.strip():
503
+ filter_dict["apartment_block"] = {"$eq": apartment_block.strip()}
504
+ if apartment_floor and apartment_floor.strip():
505
+ filter_dict["apartment_floor"] = {"$eq": apartment_floor.strip()}
506
+ if apartment_type and apartment_type.strip():
507
+ filter_dict["apartment_type"] = {"$eq": apartment_type.strip()}
508
+ if jurisdiction and jurisdiction.strip():
509
+ filter_dict["jurisdiction"] = {"$eq": jurisdiction.strip()}
510
+
511
+
512
 
513
 
514
 
 
525
  ai_response = generate_ai_response(query, relevant_docs)
526
 
527
  # Display AI response
528
+ # st.markdown(ai_response,unsafe_allow_html=True)
529
+ st.markdown(f'<div style="background: #303336; padding: 1rem; border-radius: 8px; margin: 1rem 0; line-height: 1.6;">{ai_response}</div>', unsafe_allow_html=True)
530
 
531
 
532
 
533
  st.markdown("---")
534
+
 
 
 
 
 
 
 
535
 
536
  if relevant_docs:
537
  search_time = time.time() - start_time
538
 
539
+
540
+
 
 
 
 
 
 
 
 
 
 
 
541
  # Display source documents
542
  if st.session_state.chat_mode:
543
+ st.markdown("### Evidence")
544
+ # else:
545
+ # st.markdown("### 📋 Search Results")
546
 
547
  for i, result in enumerate(relevant_docs, start=1):
548
  metadata = result["metadata"]
549
  text_content = metadata.get("text", "No text available")
550
+ doc_id = metadata.get("doc_id", "N/A")
551
+ page_no = metadata.get("page_no", "N/A")
552
+ title = metadata.get("title")
553
+
 
 
 
 
 
 
 
 
 
 
 
 
554
 
555
+ st.markdown("#### [{i}] DOC : {doc_id} | Page: {page_no} | Title {title}".format(i=i, doc_id=doc_id, page_no=page_no, title=title))
556
+
 
 
 
557
  st.markdown(f'<div style="background: #303336; padding: 1rem; border-radius: 8px; margin: 1rem 0; line-height: 1.6;">{text_content}</div>', unsafe_allow_html=True)
558
 
559
 
 
588
  if not query:
589
  st.markdown("---")
590
  st.markdown("### 💡 How to Use")
591
+
592
+ st.markdown("""
593
+ **💬 AI Chat Mode:**
594
+ - Ask natural language questions
595
+ - Get AI-generated answers based on documents
596
+ - View source documents used for the response
597
+ """)
 
 
 
 
 
 
 
 
 
 
 
598
 
599
  # -------------------------------
600
  # Footer