omarkashif committed on
Commit
8d16824
·
verified ·
1 Parent(s): 8ca6217

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +37 -19
src/streamlit_app.py CHANGED
@@ -10,7 +10,7 @@ import openai
10
  from collections import deque
11
  from sentence_transformers import SentenceTransformer
12
  from pinecone import Pinecone
13
-
14
 
15
  # Setup (exact hardcoded keys you provided)
16
  client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
@@ -56,38 +56,56 @@ def retrieve_documents(query, top_k=10):
56
  st.error(f"Retrieve error: {e}")
57
  return []
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  def generate_response(user_query, docs):
60
  context = "\n\n---\n\n".join(d['metadata']['text'] for d in docs)
61
  # sources = sorted({d['metadata']['chunk_id'] for d in docs if 'source' in d['metadata']})
 
62
  readable_sources = []
63
  for d in docs:
64
  meta = d['metadata']
65
- src = meta.get("source", "unknown")
66
- cid = meta.get("chunk_id", "N/A")
 
 
 
 
67
 
68
- if src.lower() in ["constitution", "fbr_ordinance", "ordinance"]:
69
- # For constitution and ordinances, chunk_id is enough
70
- readable_sources.append(f"{src.title()} (Chunk {cid})")
71
 
72
- elif src.lower() in ["case_law", "case", "tax_case"]:
73
- # For case law, add first ~30 words of text
74
- text_preview = " ".join(meta.get("text", "").split()[:30])
75
- readable_sources.append(f"Case Law (Chunk {cid}): {text_preview}...")
76
 
77
  else:
78
- readable_sources.append(f"{src.title()} (Chunk {cid})")
79
 
80
- # Deduplicate sources
81
  readable_sources = sorted(set(readable_sources))
82
-
 
83
  messages = [
84
  {"role": "system", "content":
85
- "You are a helpful legal assistant. Use the provided context from the documents to answer the user's question. "
86
- "At the end of your answer, write a single line starting with 'Source: ' and list the sources of the documents you used. "
87
- "Sources should be written in a human-friendly way using metadata:\n"
88
- "- Constitution / Ordinances just mention source name and chunk ID.\n"
89
- "- Case law mention source name, chunk ID, and first ~30 words of the text as a preview.\n"
90
- "Do not invent sources. If multiple are used, separate them with commas."}
 
91
  ]
92
 
93
 
 
10
  from collections import deque
11
  from sentence_transformers import SentenceTransformer
12
  from pinecone import Pinecone
13
+ import re
14
 
15
  # Setup (exact hardcoded keys you provided)
16
  client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 
56
  st.error(f"Retrieve error: {e}")
57
  return []
58
 
59
+
60
def clean_chunk_id(cid: str) -> str:
    """Return a human-readable title derived from a raw chunk id.

    The id is cut at the first ``_chunk`` / ``-chunk`` marker (matched
    case-insensitively, everything from the marker onward is dropped),
    the remaining underscores and dashes become spaces, and each word is
    capitalized with ``str.capitalize`` (so the rest of each word is
    lower-cased).

    Args:
        cid: Raw chunk id taken from a document's metadata. Non-string
            values are tolerated: ``None`` yields an empty string, and
            whole-number floats are rendered without the trailing ``.0``.

    Returns:
        The cleaned label, or ``""`` when there is nothing to show.
    """
    if cid is None:
        return ""
    # Metadata stores may hand numbers back as floats; avoid rendering "12.0".
    if isinstance(cid, float) and cid.is_integer():
        cid = int(cid)
    label = str(cid)
    # Drop the technical suffix for both separator styles so the word
    # "chunk" never reaches the user-facing source line.
    label = re.sub(r'[_-]chunk.*$', '', label, flags=re.IGNORECASE)
    # Treat both separators as word boundaries.
    label = label.replace("_", " ").replace("-", " ")
    # split() with no argument also collapses repeated whitespace.
    return " ".join(word.capitalize() for word in label.split())
69
+
70
+
71
+
72
  def generate_response(user_query, docs):
73
  context = "\n\n---\n\n".join(d['metadata']['text'] for d in docs)
74
  # sources = sorted({d['metadata']['chunk_id'] for d in docs if 'source' in d['metadata']})
75
+ # --- Build human-friendly sources ---
76
  readable_sources = []
77
  for d in docs:
78
  meta = d['metadata']
79
+ src = meta.get("source", "unknown").lower()
80
+ cid = meta.get("chunk_id", "")
81
+ text_preview = " ".join(meta.get("text", "").split()[:30])
82
+
83
+ if src in ["constitution"]:
84
+ readable_sources.append(f"Constitution ({clean_chunk_id(cid)})")
85
 
86
+ elif src in ["fbr_ordinance", "ordinance", "tax_ordinance"]:
87
+ readable_sources.append(f"Tax Ordinance ({clean_chunk_id(cid)})")
 
88
 
89
+ elif src in ["case_law", "case", "tax_case"]:
90
+ # Use first ~30 words of the actual text
91
+ readable_sources.append(f"Case Law: {text_preview}...")
 
92
 
93
  else:
94
+ readable_sources.append(f"{src.title()} ({clean_chunk_id(cid)})")
95
 
96
+ # Deduplicate and sort
97
  readable_sources = sorted(set(readable_sources))
98
+
99
+ # --- System prompt ---
100
  messages = [
101
  {"role": "system", "content":
102
+ "You are a helpful legal assistant. Use the provided context from documents to answer the user's question. "
103
+ "At the end of your answer, write a single line starting with 'Source: ' followed by the sources used. "
104
+ "Formatting rules:\n"
105
+ "- For Constitution / Ordinances: show the clean chunk id, no underscores/dashes, capitalized words.\n"
106
+ "- For Case law: ignore chunk id, instead show first ~30 words of the case text.\n"
107
+ "- Do not use technical terms like 'chunk'. Present sources in a human-friendly way.\n"
108
+ "If multiple are used, separate them with commas."}
109
  ]
110
 
111