Spaces:
Sleeping
Sleeping
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +37 -19
src/streamlit_app.py
CHANGED
|
@@ -10,7 +10,7 @@ import openai
|
|
| 10 |
from collections import deque
|
| 11 |
from sentence_transformers import SentenceTransformer
|
| 12 |
from pinecone import Pinecone
|
| 13 |
-
|
| 14 |
|
| 15 |
# Setup: API clients (the OpenAI key below is read from the OPENAI_API_KEY environment variable)
|
| 16 |
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
|
@@ -56,38 +56,56 @@ def retrieve_documents(query, top_k=10):
|
|
| 56 |
st.error(f"Retrieve error: {e}")
|
| 57 |
return []
|
| 58 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
def generate_response(user_query, docs):
|
| 60 |
context = "\n\n---\n\n".join(d['metadata']['text'] for d in docs)
|
| 61 |
# sources = sorted({d['metadata']['chunk_id'] for d in docs if 'source' in d['metadata']})
|
|
|
|
| 62 |
readable_sources = []
|
| 63 |
for d in docs:
|
| 64 |
meta = d['metadata']
|
| 65 |
-
src = meta.get("source", "unknown")
|
| 66 |
-
cid = meta.get("chunk_id", "
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
readable_sources.append(f"{src.title()} (Chunk {cid})")
|
| 71 |
|
| 72 |
-
elif src
|
| 73 |
-
#
|
| 74 |
-
|
| 75 |
-
readable_sources.append(f"Case Law (Chunk {cid}): {text_preview}...")
|
| 76 |
|
| 77 |
else:
|
| 78 |
-
readable_sources.append(f"{src.title()} (
|
| 79 |
|
| 80 |
-
# Deduplicate
|
| 81 |
readable_sources = sorted(set(readable_sources))
|
| 82 |
-
|
|
|
|
| 83 |
messages = [
|
| 84 |
{"role": "system", "content":
|
| 85 |
-
"You are a helpful legal assistant. Use the provided context from
|
| 86 |
-
"At the end of your answer, write a single line starting with 'Source: '
|
| 87 |
-
"
|
| 88 |
-
"- Constitution / Ordinances
|
| 89 |
-
"- Case law
|
| 90 |
-
"Do not
|
|
|
|
| 91 |
]
|
| 92 |
|
| 93 |
|
|
|
|
| 10 |
from collections import deque
|
| 11 |
from sentence_transformers import SentenceTransformer
|
| 12 |
from pinecone import Pinecone
|
| 13 |
+
import re
|
| 14 |
|
| 15 |
# Setup: API clients (the OpenAI key below is read from the OPENAI_API_KEY environment variable)
|
| 16 |
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
|
|
|
| 56 |
st.error(f"Retrieve error: {e}")
|
| 57 |
return []
|
| 58 |
|
| 59 |
+
|
| 60 |
+
def clean_chunk_id(cid: str) -> str:
    """Return a human-readable label derived from a raw chunk id.

    Everything from the first ``_chunk`` onwards is dropped, underscores and
    dashes become spaces, and each remaining word is capitalized (first
    character upper-cased, the rest lower-cased).

    Args:
        cid: Raw chunk id. ``None`` is treated as an empty id and any other
            non-string value is converted with ``str()``, so a missing or
            numeric id does not make ``re.sub`` raise ``TypeError``.

    Returns:
        The cleaned label, or an empty string when nothing is left of the id.
    """
    if cid is None:
        return ""
    if not isinstance(cid, str):
        cid = str(cid)

    # The pattern is not anchored on the left, so the cut happens at the
    # FIRST "_chunk" in the id, not only at a trailing suffix.
    stem = re.sub(r'_chunk.*$', '', cid)

    # Treat both separators as word boundaries; split() also collapses
    # repeated separators and trims the ends.
    words = stem.replace("_", " ").replace("-", " ").split()
    return " ".join(word.capitalize() for word in words)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
|
| 72 |
def generate_response(user_query, docs):
|
| 73 |
context = "\n\n---\n\n".join(d['metadata']['text'] for d in docs)
|
| 74 |
# sources = sorted({d['metadata']['chunk_id'] for d in docs if 'source' in d['metadata']})
|
| 75 |
+
# --- Build human-friendly sources ---
|
| 76 |
readable_sources = []
|
| 77 |
for d in docs:
|
| 78 |
meta = d['metadata']
|
| 79 |
+
src = meta.get("source", "unknown").lower()
|
| 80 |
+
cid = meta.get("chunk_id", "")
|
| 81 |
+
text_preview = " ".join(meta.get("text", "").split()[:30])
|
| 82 |
+
|
| 83 |
+
if src in ["constitution"]:
|
| 84 |
+
readable_sources.append(f"Constitution ({clean_chunk_id(cid)})")
|
| 85 |
|
| 86 |
+
elif src in ["fbr_ordinance", "ordinance", "tax_ordinance"]:
|
| 87 |
+
readable_sources.append(f"Tax Ordinance ({clean_chunk_id(cid)})")
|
|
|
|
| 88 |
|
| 89 |
+
elif src in ["case_law", "case", "tax_case"]:
|
| 90 |
+
# Use first ~30 words of the actual text
|
| 91 |
+
readable_sources.append(f"Case Law: {text_preview}...")
|
|
|
|
| 92 |
|
| 93 |
else:
|
| 94 |
+
readable_sources.append(f"{src.title()} ({clean_chunk_id(cid)})")
|
| 95 |
|
| 96 |
+
# Deduplicate and sort
|
| 97 |
readable_sources = sorted(set(readable_sources))
|
| 98 |
+
|
| 99 |
+
# --- System prompt ---
|
| 100 |
messages = [
|
| 101 |
{"role": "system", "content":
|
| 102 |
+
"You are a helpful legal assistant. Use the provided context from documents to answer the user's question. "
|
| 103 |
+
"At the end of your answer, write a single line starting with 'Source: ' followed by the sources used. "
|
| 104 |
+
"Formatting rules:\n"
|
| 105 |
+
"- For Constitution / Ordinances: show the clean chunk id, no underscores/dashes, capitalized words.\n"
|
| 106 |
+
"- For Case law: ignore chunk id, instead show first ~30 words of the case text.\n"
|
| 107 |
+
"- Do not use technical terms like 'chunk'. Present sources in a human-friendly way.\n"
|
| 108 |
+
"If multiple are used, separate them with commas."}
|
| 109 |
]
|
| 110 |
|
| 111 |
|