Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- Dockerfile +0 -0
- README.md +2 -0
- api/main.py +2 -0
- buglog.md +14 -0
- src/agent_v2.py +205 -165
- src/ner.py +49 -51
- src/system_prompt.py +143 -180
- src/verify.py +130 -29
Dockerfile
CHANGED
|
Binary files a/Dockerfile and b/Dockerfile differ
|
|
|
README.md
CHANGED
|
@@ -20,6 +20,8 @@ Ask questions about Indian Supreme Court judgments (1950–2024).
|
|
| 20 |
|
| 21 |
> Retrieval-Augmented Generation over 26,688 Supreme Court of India judgments (1950–2024).
|
| 22 |
> Ask a legal question. Get a cited answer grounded in real case law.
|
|
|
|
|
|
|
| 23 |
|
| 24 |
[](https://huggingface.co/spaces/CaffeinatedCoding/nyayasetu)
|
| 25 |
[](https://github.com/devangmishra1424/nyayasetu/actions)
|
|
|
|
| 20 |
|
| 21 |
> Retrieval-Augmented Generation over 26,688 Supreme Court of India judgments (1950–2024).
|
| 22 |
> Ask a legal question. Get a cited answer grounded in real case law.
|
| 23 |
+
> 1,025,764 chunks indexed (SC judgments, HC judgments, bare acts, constitution, legal references)
|
| 24 |
+
> V2 agent with 3-pass reasoning loop and conversation memory
|
| 25 |
|
| 26 |
[](https://huggingface.co/spaces/CaffeinatedCoding/nyayasetu)
|
| 27 |
[](https://github.com/devangmishra1424/nyayasetu/actions)
|
api/main.py
CHANGED
|
@@ -22,6 +22,8 @@ logger = logging.getLogger(__name__)
|
|
| 22 |
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 23 |
|
| 24 |
def download_models():
|
|
|
|
|
|
|
| 25 |
hf_token = os.getenv("HF_TOKEN")
|
| 26 |
if not hf_token:
|
| 27 |
logger.warning("HF_TOKEN not set — skipping model download.")
|
|
|
|
| 22 |
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 23 |
|
| 24 |
def download_models():
|
| 25 |
+
from src.ner import load_ner_model
|
| 26 |
+
load_ner_model()
|
| 27 |
hf_token = os.getenv("HF_TOKEN")
|
| 28 |
if not hf_token:
|
| 29 |
logger.warning("HF_TOKEN not set — skipping model download.")
|
buglog.md
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## Bug 1: Kaggle Secrets wrong pattern
|
| 2 |
+
**Bug:** HF token upload failed — was using token value as key name
|
| 3 |
+
**Found:** Upload cell returned 401
|
| 4 |
+
**Fixed:** Changed to `hf_token = secrets.get_secret("HF_TOKEN")`
|
| 5 |
+
|
| 6 |
+
## Bug 2: Docker container caching old files
|
| 7 |
+
**Bug:** Frontend changes not reflecting after push
|
| 8 |
+
**Found:** UI still showing old version after multiple pushes
|
| 9 |
+
**Fixed:** Must push change to Dockerfile to force container rebuild, not just restart
|
| 10 |
+
|
| 11 |
+
## Bug 3: OneDrive destroying git repository
|
| 12 |
+
**Bug:** All local project files disappeared
|
| 13 |
+
**Found:** OneDrive moved files to cloud-only to free local space
|
| 14 |
+
**Fixed:** Moved project to C:\Projects outside OneDrive. Never store git repos inside OneDrive.
|
src/agent_v2.py
CHANGED
|
@@ -1,38 +1,33 @@
|
|
| 1 |
"""
|
| 2 |
-
NyayaSetu V2 Agent —
|
| 3 |
|
| 4 |
-
Pass 1 — ANALYSE:
|
| 5 |
-
|
| 6 |
-
|
| 7 |
|
| 8 |
-
Pass 2 — RETRIEVE: Parallel FAISS search
|
| 9 |
-
from Pass 1. No LLM call. Pure vector search.
|
| 10 |
|
| 11 |
-
Pass 3 — RESPOND:
|
| 12 |
-
|
| 13 |
|
| 14 |
-
2 LLM calls per turn
|
| 15 |
-
src/agent.py is untouched — this is additive.
|
| 16 |
"""
|
| 17 |
|
| 18 |
-
import os
|
| 19 |
-
import sys
|
| 20 |
-
import json
|
| 21 |
-
import time
|
| 22 |
-
import logging
|
| 23 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 24 |
from typing import Dict, Any, List
|
| 25 |
|
|
|
|
| 26 |
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 27 |
|
| 28 |
from src.embed import embed_text
|
| 29 |
from src.retrieval import retrieve
|
| 30 |
from src.verify import verify_citations
|
| 31 |
from src.system_prompt import build_prompt, ANALYSIS_PROMPT
|
|
|
|
| 32 |
|
| 33 |
logger = logging.getLogger(__name__)
|
| 34 |
|
| 35 |
-
# ── Groq client (same as llm.py) ──────────────────────────
|
| 36 |
from groq import Groq
|
| 37 |
from tenacity import retry, stop_after_attempt, wait_exponential
|
| 38 |
from dotenv import load_dotenv
|
|
@@ -40,89 +35,133 @@ from dotenv import load_dotenv
|
|
| 40 |
load_dotenv()
|
| 41 |
_client = Groq(api_key=os.getenv("GROQ_API_KEY"))
|
| 42 |
|
| 43 |
-
# ──
|
| 44 |
-
# Resets on container restart — acceptable for free tier
|
| 45 |
sessions: Dict[str, Dict] = {}
|
| 46 |
|
| 47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
def get_or_create_session(session_id: str) -> Dict:
|
| 49 |
-
"""Get existing session or create a fresh one."""
|
| 50 |
if session_id not in sessions:
|
| 51 |
sessions[session_id] = {
|
| 52 |
"summary": "",
|
| 53 |
"last_3_messages": [],
|
| 54 |
-
"case_state":
|
| 55 |
-
"facts_established": [],
|
| 56 |
-
"facts_missing": [],
|
| 57 |
-
"hypotheses": [],
|
| 58 |
-
"retrieved_cases": [],
|
| 59 |
-
"stage": "intake",
|
| 60 |
-
"last_response_type": "none"
|
| 61 |
-
}
|
| 62 |
}
|
| 63 |
return sessions[session_id]
|
| 64 |
|
| 65 |
|
| 66 |
def update_session(session_id: str, analysis: Dict, user_message: str, response: str):
|
| 67 |
-
"""Update session state after each turn."""
|
| 68 |
session = sessions[session_id]
|
|
|
|
| 69 |
|
| 70 |
-
# Update summary from Pass 1 output
|
| 71 |
if analysis.get("updated_summary"):
|
| 72 |
session["summary"] = analysis["updated_summary"]
|
| 73 |
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
-
# Update case state
|
| 81 |
-
cs = session["case_state"]
|
| 82 |
cs["stage"] = analysis.get("stage", cs["stage"])
|
| 83 |
cs["last_response_type"] = analysis.get("action_needed", "none")
|
|
|
|
|
|
|
| 84 |
|
| 85 |
-
if
|
| 86 |
-
cs["
|
| 87 |
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
|
| 93 |
|
| 94 |
# ── Pass 1: Analyse ───────────────────────────────────────
|
| 95 |
@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=4))
|
| 96 |
def analyse(user_message: str, session: Dict) -> Dict:
|
| 97 |
-
"""
|
| 98 |
-
LLM call 1: Understand the message, detect intent,
|
| 99 |
-
form search queries, update summary.
|
| 100 |
-
Returns structured analysis dict.
|
| 101 |
-
"""
|
| 102 |
summary = session.get("summary", "")
|
| 103 |
last_msgs = session.get("last_3_messages", [])
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
user_content = f"""CONVERSATION SUMMARY:
|
| 115 |
-
{summary if summary else "
|
| 116 |
|
| 117 |
RECENT MESSAGES:
|
| 118 |
{history_text if history_text else "None"}
|
| 119 |
|
| 120 |
LAST RESPONSE TYPE: {last_response_type}
|
|
|
|
|
|
|
| 121 |
|
| 122 |
NEW USER MESSAGE:
|
| 123 |
{user_message}
|
| 124 |
|
| 125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
response = _client.chat.completions.create(
|
| 128 |
model="llama-3.3-70b-versatile",
|
|
@@ -131,31 +170,27 @@ Remember: If last_response_type was "question", action_needed CANNOT be "questio
|
|
| 131 |
{"role": "user", "content": user_content}
|
| 132 |
],
|
| 133 |
temperature=0.1,
|
| 134 |
-
max_tokens=
|
| 135 |
)
|
| 136 |
|
| 137 |
raw = response.choices[0].message.content.strip()
|
| 138 |
-
|
| 139 |
-
# Parse JSON — strip any accidental markdown fences
|
| 140 |
raw = raw.replace("```json", "").replace("```", "").strip()
|
| 141 |
|
| 142 |
try:
|
| 143 |
analysis = json.loads(raw)
|
| 144 |
except json.JSONDecodeError:
|
| 145 |
logger.warning(f"Pass 1 JSON parse failed: {raw[:200]}")
|
| 146 |
-
# Fallback analysis
|
| 147 |
analysis = {
|
| 148 |
-
"tone": "casual",
|
| 149 |
-
"
|
| 150 |
-
"subject": "legal query",
|
| 151 |
-
"action_needed": "advice",
|
| 152 |
"urgency": "medium",
|
| 153 |
-
"
|
| 154 |
-
"facts_missing": [],
|
| 155 |
-
"stage": "understanding",
|
| 156 |
-
"
|
| 157 |
-
"
|
| 158 |
-
"
|
|
|
|
| 159 |
}
|
| 160 |
|
| 161 |
return analysis
|
|
@@ -163,11 +198,6 @@ Remember: If last_response_type was "question", action_needed CANNOT be "questio
|
|
| 163 |
|
| 164 |
# ── Pass 2: Retrieve ──────────────────────────────────────
|
| 165 |
def retrieve_parallel(search_queries: List[str], top_k: int = 5) -> List[Dict]:
|
| 166 |
-
"""
|
| 167 |
-
Run multiple FAISS queries in parallel.
|
| 168 |
-
Merge results, deduplicate by chunk_id, re-rank by score.
|
| 169 |
-
Returns top_k unique chunks.
|
| 170 |
-
"""
|
| 171 |
if not search_queries:
|
| 172 |
return []
|
| 173 |
|
|
@@ -176,101 +206,117 @@ def retrieve_parallel(search_queries: List[str], top_k: int = 5) -> List[Dict]:
|
|
| 176 |
def search_one(query):
|
| 177 |
try:
|
| 178 |
embedding = embed_text(query)
|
| 179 |
-
|
| 180 |
-
return results
|
| 181 |
except Exception as e:
|
| 182 |
-
logger.warning(f"FAISS search failed
|
| 183 |
return []
|
| 184 |
|
| 185 |
-
# Run queries in parallel
|
| 186 |
with ThreadPoolExecutor(max_workers=min(3, len(search_queries))) as executor:
|
| 187 |
futures = {executor.submit(search_one, q): q for q in search_queries}
|
| 188 |
for future in as_completed(futures):
|
| 189 |
-
|
| 190 |
-
all_results.extend(results)
|
| 191 |
|
| 192 |
-
# Deduplicate by chunk_id, keep best score
|
| 193 |
seen = {}
|
| 194 |
for chunk in all_results:
|
| 195 |
cid = chunk.get("chunk_id") or chunk.get("judgment_id", "")
|
| 196 |
-
score = chunk.get("similarity_score",
|
| 197 |
if cid not in seen or score < seen[cid]["similarity_score"]:
|
| 198 |
seen[cid] = chunk
|
| 199 |
|
| 200 |
-
|
| 201 |
-
unique_chunks = sorted(seen.values(), key=lambda x: x.get("similarity_score", 999))
|
| 202 |
-
return unique_chunks[:top_k]
|
| 203 |
|
| 204 |
|
| 205 |
# ── Pass 3: Respond ───────────────────────────────────────
|
| 206 |
@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=2, max=8))
|
| 207 |
-
def respond(
|
| 208 |
-
user_message: str,
|
| 209 |
-
analysis: Dict,
|
| 210 |
-
chunks: List[Dict],
|
| 211 |
-
session: Dict
|
| 212 |
-
) -> str:
|
| 213 |
-
"""
|
| 214 |
-
LLM call 2: Generate the final response.
|
| 215 |
-
Uses dynamically assembled prompt based on analysis.
|
| 216 |
-
"""
|
| 217 |
-
# Build dynamic system prompt
|
| 218 |
system_prompt = build_prompt(analysis)
|
|
|
|
| 219 |
|
| 220 |
-
# Build context from retrieved chunks
|
| 221 |
context_parts = []
|
| 222 |
-
for
|
| 223 |
source_type = chunk.get("source_type", "case_law")
|
| 224 |
title = chunk.get("title", "Unknown")
|
| 225 |
year = chunk.get("year", "")
|
| 226 |
jid = chunk.get("judgment_id", "")
|
| 227 |
text = chunk.get("expanded_context") or chunk.get("chunk_text") or chunk.get("text", "")
|
| 228 |
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
else:
|
| 238 |
-
header = f"[CASE: {title} | {year} | ID: {jid}]"
|
| 239 |
-
|
| 240 |
context_parts.append(f"{header}\n{text[:800]}")
|
| 241 |
|
| 242 |
context = "\n\n".join(context_parts) if context_parts else "No relevant sources retrieved."
|
| 243 |
|
| 244 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
summary = session.get("summary", "")
|
| 246 |
last_msgs = session.get("last_3_messages", [])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
history_text = "\n".join(
|
| 251 |
-
f"{m['role'].upper()}: {m['content'][:300]}"
|
| 252 |
-
for m in last_msgs[-4:]
|
| 253 |
-
)
|
| 254 |
-
|
| 255 |
-
user_content = f"""CONVERSATION CONTEXT:
|
| 256 |
-
{summary if summary else "First message in this conversation."}
|
| 257 |
|
| 258 |
RECENT CONVERSATION:
|
| 259 |
-
{history_text if history_text else "
|
|
|
|
| 260 |
|
| 261 |
RETRIEVED LEGAL SOURCES:
|
| 262 |
{context}
|
| 263 |
|
| 264 |
USER MESSAGE: {user_message}
|
| 265 |
|
| 266 |
-
|
| 267 |
-
- Legal
|
| 268 |
- Stage: {analysis.get('stage', 'understanding')}
|
| 269 |
- Urgency: {analysis.get('urgency', 'medium')}
|
| 270 |
-
- Response type
|
|
|
|
|
|
|
| 271 |
|
| 272 |
-
|
| 273 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
|
| 275 |
response = _client.chat.completions.create(
|
| 276 |
model="llama-3.3-70b-versatile",
|
|
@@ -279,7 +325,7 @@ Your own legal knowledge can be used for general reasoning and context."""
|
|
| 279 |
{"role": "user", "content": user_content}
|
| 280 |
],
|
| 281 |
temperature=0.3,
|
| 282 |
-
max_tokens=
|
| 283 |
)
|
| 284 |
|
| 285 |
return response.choices[0].message.content
|
|
@@ -287,79 +333,73 @@ Your own legal knowledge can be used for general reasoning and context."""
|
|
| 287 |
|
| 288 |
# ── Main entry point ──────────────────────────────────────
|
| 289 |
def run_query_v2(user_message: str, session_id: str) -> Dict[str, Any]:
|
| 290 |
-
"""
|
| 291 |
-
Main V2 pipeline. 3 passes per query.
|
| 292 |
-
Returns structured response dict compatible with existing API schema.
|
| 293 |
-
"""
|
| 294 |
start = time.time()
|
| 295 |
-
|
| 296 |
-
# Get or create session
|
| 297 |
session = get_or_create_session(session_id)
|
| 298 |
|
| 299 |
-
#
|
| 300 |
try:
|
| 301 |
analysis = analyse(user_message, session)
|
| 302 |
except Exception as e:
|
| 303 |
logger.error(f"Pass 1 failed: {e}")
|
| 304 |
analysis = {
|
| 305 |
-
"tone": "casual",
|
| 306 |
-
"
|
| 307 |
-
"subject": "legal query",
|
| 308 |
-
"action_needed": "advice",
|
| 309 |
"urgency": "medium",
|
| 310 |
-
"
|
| 311 |
-
"facts_missing": [],
|
| 312 |
-
"stage": "understanding",
|
| 313 |
-
"last_response_type": "none",
|
| 314 |
"updated_summary": user_message[:200],
|
| 315 |
-
"search_queries": [user_message[:200]]
|
|
|
|
|
|
|
| 316 |
}
|
| 317 |
|
| 318 |
-
#
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
search_queries = [user_message]
|
| 322 |
|
| 323 |
-
#
|
| 324 |
-
|
| 325 |
-
|
|
|
|
|
|
|
|
|
|
| 326 |
|
| 327 |
chunks = []
|
| 328 |
try:
|
| 329 |
chunks = retrieve_parallel(search_queries[:3], top_k=5)
|
| 330 |
except Exception as e:
|
| 331 |
-
logger.error(f"Pass 2
|
| 332 |
|
| 333 |
-
#
|
| 334 |
try:
|
| 335 |
answer = respond(user_message, analysis, chunks, session)
|
| 336 |
except Exception as e:
|
| 337 |
logger.error(f"Pass 3 failed: {e}")
|
| 338 |
if chunks:
|
| 339 |
fallback = "\n\n".join(
|
| 340 |
-
f"[{c.get('title', 'Source')}]\n{
|
| 341 |
for c in chunks[:3]
|
| 342 |
)
|
| 343 |
-
answer = f"
|
| 344 |
else:
|
| 345 |
answer = "I encountered an issue processing your request. Please try again."
|
| 346 |
|
| 347 |
-
# ── Verification ───────────────────────────────────────
|
| 348 |
verification_status, unverified_quotes = verify_citations(answer, chunks)
|
| 349 |
-
|
| 350 |
-
# ── Update session ─────────────────────────────────────
|
| 351 |
update_session(session_id, analysis, user_message, answer)
|
| 352 |
|
| 353 |
-
# ── Build response ─────────────────────────────────────
|
| 354 |
sources = []
|
| 355 |
for c in chunks:
|
|
|
|
|
|
|
| 356 |
sources.append({
|
| 357 |
"meta": {
|
| 358 |
-
"judgment_id":
|
|
|
|
| 359 |
"year": c.get("year", ""),
|
| 360 |
"chunk_index": c.get("chunk_index", 0),
|
| 361 |
"source_type": c.get("source_type", "case_law"),
|
| 362 |
-
"
|
| 363 |
},
|
| 364 |
"text": (c.get("expanded_context") or c.get("chunk_text") or c.get("text", ""))[:600]
|
| 365 |
})
|
|
@@ -370,14 +410,14 @@ def run_query_v2(user_message: str, session_id: str) -> Dict[str, Any]:
|
|
| 370 |
"sources": sources,
|
| 371 |
"verification_status": verification_status,
|
| 372 |
"unverified_quotes": unverified_quotes,
|
| 373 |
-
"entities":
|
| 374 |
"num_sources": len(chunks),
|
| 375 |
-
"truncated":
|
| 376 |
"session_id": session_id,
|
| 377 |
"analysis": {
|
| 378 |
"tone": analysis.get("tone"),
|
| 379 |
"stage": analysis.get("stage"),
|
| 380 |
"urgency": analysis.get("urgency"),
|
| 381 |
-
"hypotheses": analysis.get("
|
| 382 |
}
|
| 383 |
}
|
|
|
|
| 1 |
"""
|
| 2 |
+
NyayaSetu V2 Agent — Full Intelligence Layer.
|
| 3 |
|
| 4 |
+
Pass 1 — ANALYSE: Understands message, detects tone/stage,
|
| 5 |
+
builds structured fact web, updates hypotheses,
|
| 6 |
+
forms targeted search queries, compresses summary.
|
| 7 |
|
| 8 |
+
Pass 2 — RETRIEVE: Parallel FAISS search. No LLM call.
|
|
|
|
| 9 |
|
| 10 |
+
Pass 3 — RESPOND: Dynamically assembled prompt + retrieved
|
| 11 |
+
context + full case state. Format-intelligent output.
|
| 12 |
|
| 13 |
+
2 LLM calls per turn. src/agent.py untouched.
|
|
|
|
| 14 |
"""
|
| 15 |
|
| 16 |
+
import os, sys, json, time, logging
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 18 |
from typing import Dict, Any, List
|
| 19 |
|
| 20 |
+
# sys.path must be set before any local imports
|
| 21 |
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 22 |
|
| 23 |
from src.embed import embed_text
|
| 24 |
from src.retrieval import retrieve
|
| 25 |
from src.verify import verify_citations
|
| 26 |
from src.system_prompt import build_prompt, ANALYSIS_PROMPT
|
| 27 |
+
from src.ner import extract_entities, augment_query
|
| 28 |
|
| 29 |
logger = logging.getLogger(__name__)
|
| 30 |
|
|
|
|
| 31 |
from groq import Groq
|
| 32 |
from tenacity import retry, stop_after_attempt, wait_exponential
|
| 33 |
from dotenv import load_dotenv
|
|
|
|
| 35 |
load_dotenv()
|
| 36 |
_client = Groq(api_key=os.getenv("GROQ_API_KEY"))
|
| 37 |
|
| 38 |
+
# ── Session store ─────────────────────────────────────────
|
|
|
|
| 39 |
sessions: Dict[str, Dict] = {}
|
| 40 |
|
| 41 |
|
| 42 |
+
def empty_case_state() -> Dict:
|
| 43 |
+
return {
|
| 44 |
+
"parties": [],
|
| 45 |
+
"events": [],
|
| 46 |
+
"documents": [],
|
| 47 |
+
"amounts": [],
|
| 48 |
+
"locations": [],
|
| 49 |
+
"timeline": [],
|
| 50 |
+
"disputes": [],
|
| 51 |
+
"hypotheses": [],
|
| 52 |
+
"stage": "intake",
|
| 53 |
+
"last_response_type": "none",
|
| 54 |
+
"turn_count": 0,
|
| 55 |
+
"facts_missing": [],
|
| 56 |
+
"context_interpreted": False,
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
|
| 60 |
def get_or_create_session(session_id: str) -> Dict:
|
|
|
|
| 61 |
if session_id not in sessions:
|
| 62 |
sessions[session_id] = {
|
| 63 |
"summary": "",
|
| 64 |
"last_3_messages": [],
|
| 65 |
+
"case_state": empty_case_state()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
}
|
| 67 |
return sessions[session_id]
|
| 68 |
|
| 69 |
|
| 70 |
def update_session(session_id: str, analysis: Dict, user_message: str, response: str):
|
|
|
|
| 71 |
session = sessions[session_id]
|
| 72 |
+
cs = session["case_state"]
|
| 73 |
|
|
|
|
| 74 |
if analysis.get("updated_summary"):
|
| 75 |
session["summary"] = analysis["updated_summary"]
|
| 76 |
|
| 77 |
+
facts = analysis.get("facts_extracted", {})
|
| 78 |
+
if facts:
|
| 79 |
+
for key in ["parties", "events", "documents", "amounts", "locations", "disputes"]:
|
| 80 |
+
new_items = facts.get(key, [])
|
| 81 |
+
existing = cs.get(key, [])
|
| 82 |
+
for item in new_items:
|
| 83 |
+
if item and item not in existing:
|
| 84 |
+
existing.append(item)
|
| 85 |
+
cs[key] = existing
|
| 86 |
+
|
| 87 |
+
for ev in facts.get("timeline_events", []):
|
| 88 |
+
if ev and ev not in cs["timeline"]:
|
| 89 |
+
cs["timeline"].append(ev)
|
| 90 |
+
|
| 91 |
+
for nh in analysis.get("hypotheses", []):
|
| 92 |
+
existing_claims = [h["claim"] for h in cs["hypotheses"]]
|
| 93 |
+
if nh.get("claim") and nh["claim"] not in existing_claims:
|
| 94 |
+
cs["hypotheses"].append(nh)
|
| 95 |
+
else:
|
| 96 |
+
for h in cs["hypotheses"]:
|
| 97 |
+
if h["claim"] == nh.get("claim"):
|
| 98 |
+
h["confidence"] = nh.get("confidence", h["confidence"])
|
| 99 |
+
for e in nh.get("evidence", []):
|
| 100 |
+
if e not in h.get("evidence", []):
|
| 101 |
+
h.setdefault("evidence", []).append(e)
|
| 102 |
|
|
|
|
|
|
|
| 103 |
cs["stage"] = analysis.get("stage", cs["stage"])
|
| 104 |
cs["last_response_type"] = analysis.get("action_needed", "none")
|
| 105 |
+
cs["facts_missing"] = analysis.get("facts_missing", [])
|
| 106 |
+
cs["turn_count"] = cs.get("turn_count", 0) + 1
|
| 107 |
|
| 108 |
+
if cs["turn_count"] >= 3:
|
| 109 |
+
cs["context_interpreted"] = True
|
| 110 |
|
| 111 |
+
session["last_3_messages"].append({"role": "user", "content": user_message})
|
| 112 |
+
session["last_3_messages"].append({"role": "assistant", "content": response[:400]})
|
| 113 |
+
if len(session["last_3_messages"]) > 6:
|
| 114 |
+
session["last_3_messages"] = session["last_3_messages"][-6:]
|
| 115 |
|
| 116 |
|
| 117 |
# ── Pass 1: Analyse ───────────────────────────────────────
|
| 118 |
@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=4))
|
| 119 |
def analyse(user_message: str, session: Dict) -> Dict:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
summary = session.get("summary", "")
|
| 121 |
last_msgs = session.get("last_3_messages", [])
|
| 122 |
+
cs = session["case_state"]
|
| 123 |
+
last_response_type = cs.get("last_response_type", "none")
|
| 124 |
+
turn_count = cs.get("turn_count", 0)
|
| 125 |
+
|
| 126 |
+
history_text = "\n".join(
|
| 127 |
+
f"{m['role'].upper()}: {m['content'][:250]}"
|
| 128 |
+
for m in last_msgs[-4:]
|
| 129 |
+
) if last_msgs else ""
|
| 130 |
+
|
| 131 |
+
fact_web = ""
|
| 132 |
+
if any(cs.get(k) for k in ["parties", "events", "documents", "amounts", "disputes"]):
|
| 133 |
+
hyp_lines = "\n".join(
|
| 134 |
+
f" - {h['claim']} [{h.get('confidence','?')}]"
|
| 135 |
+
for h in cs.get("hypotheses", [])[:3]
|
| 136 |
+
) or " none yet"
|
| 137 |
+
fact_web = f"""
|
| 138 |
+
CURRENT FACT WEB:
|
| 139 |
+
- Parties: {', '.join(cs.get('parties', [])) or 'none'}
|
| 140 |
+
- Events: {', '.join(cs.get('events', [])) or 'none'}
|
| 141 |
+
- Documents/Evidence: {', '.join(cs.get('documents', [])) or 'none'}
|
| 142 |
+
- Amounts: {', '.join(cs.get('amounts', [])) or 'none'}
|
| 143 |
+
- Disputes: {', '.join(cs.get('disputes', [])) or 'none'}
|
| 144 |
+
- Active hypotheses:
|
| 145 |
+
{hyp_lines}"""
|
| 146 |
|
| 147 |
user_content = f"""CONVERSATION SUMMARY:
|
| 148 |
+
{summary if summary else "First message — no prior context."}
|
| 149 |
|
| 150 |
RECENT MESSAGES:
|
| 151 |
{history_text if history_text else "None"}
|
| 152 |
|
| 153 |
LAST RESPONSE TYPE: {last_response_type}
|
| 154 |
+
TURN COUNT: {turn_count}
|
| 155 |
+
{fact_web}
|
| 156 |
|
| 157 |
NEW USER MESSAGE:
|
| 158 |
{user_message}
|
| 159 |
|
| 160 |
+
Rules:
|
| 161 |
+
- If last_response_type was "question", action_needed CANNOT be "question"
|
| 162 |
+
- Extract ALL facts from user message even if implied
|
| 163 |
+
- Update hypothesis confidence based on new evidence
|
| 164 |
+
- search_queries must be specific legal questions for vector search"""
|
| 165 |
|
| 166 |
response = _client.chat.completions.create(
|
| 167 |
model="llama-3.3-70b-versatile",
|
|
|
|
| 170 |
{"role": "user", "content": user_content}
|
| 171 |
],
|
| 172 |
temperature=0.1,
|
| 173 |
+
max_tokens=900
|
| 174 |
)
|
| 175 |
|
| 176 |
raw = response.choices[0].message.content.strip()
|
|
|
|
|
|
|
| 177 |
raw = raw.replace("```json", "").replace("```", "").strip()
|
| 178 |
|
| 179 |
try:
|
| 180 |
analysis = json.loads(raw)
|
| 181 |
except json.JSONDecodeError:
|
| 182 |
logger.warning(f"Pass 1 JSON parse failed: {raw[:200]}")
|
|
|
|
| 183 |
analysis = {
|
| 184 |
+
"tone": "casual", "format_requested": "none",
|
| 185 |
+
"subject": "legal query", "action_needed": "advice",
|
|
|
|
|
|
|
| 186 |
"urgency": "medium",
|
| 187 |
+
"hypotheses": [{"claim": user_message[:80], "confidence": "low", "evidence": []}],
|
| 188 |
+
"facts_extracted": {}, "facts_missing": [],
|
| 189 |
+
"stage": "understanding", "last_response_type": last_response_type,
|
| 190 |
+
"updated_summary": f"{summary} | {user_message[:100]}",
|
| 191 |
+
"search_queries": [user_message[:200]],
|
| 192 |
+
"should_interpret_context": False,
|
| 193 |
+
"format_decision": "none"
|
| 194 |
}
|
| 195 |
|
| 196 |
return analysis
|
|
|
|
| 198 |
|
| 199 |
# ── Pass 2: Retrieve ──────────────────────────────────────
|
| 200 |
def retrieve_parallel(search_queries: List[str], top_k: int = 5) -> List[Dict]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
if not search_queries:
|
| 202 |
return []
|
| 203 |
|
|
|
|
| 206 |
def search_one(query):
|
| 207 |
try:
|
| 208 |
embedding = embed_text(query)
|
| 209 |
+
return retrieve(embedding, top_k=top_k)
|
|
|
|
| 210 |
except Exception as e:
|
| 211 |
+
logger.warning(f"FAISS search failed: {e}")
|
| 212 |
return []
|
| 213 |
|
|
|
|
| 214 |
with ThreadPoolExecutor(max_workers=min(3, len(search_queries))) as executor:
|
| 215 |
futures = {executor.submit(search_one, q): q for q in search_queries}
|
| 216 |
for future in as_completed(futures):
|
| 217 |
+
all_results.extend(future.result())
|
|
|
|
| 218 |
|
|
|
|
| 219 |
seen = {}
|
| 220 |
for chunk in all_results:
|
| 221 |
cid = chunk.get("chunk_id") or chunk.get("judgment_id", "")
|
| 222 |
+
score = chunk.get("similarity_score", 999)
|
| 223 |
if cid not in seen or score < seen[cid]["similarity_score"]:
|
| 224 |
seen[cid] = chunk
|
| 225 |
|
| 226 |
+
return sorted(seen.values(), key=lambda x: x.get("similarity_score", 999))[:top_k]
|
|
|
|
|
|
|
| 227 |
|
| 228 |
|
| 229 |
# ── Pass 3: Respond ───────────────────────────────────────
|
| 230 |
@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=2, max=8))
|
| 231 |
+
def respond(user_message: str, analysis: Dict, chunks: List[Dict], session: Dict) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
system_prompt = build_prompt(analysis)
|
| 233 |
+
cs = session["case_state"]
|
| 234 |
|
|
|
|
| 235 |
context_parts = []
|
| 236 |
+
for chunk in chunks[:5]:
|
| 237 |
source_type = chunk.get("source_type", "case_law")
|
| 238 |
title = chunk.get("title", "Unknown")
|
| 239 |
year = chunk.get("year", "")
|
| 240 |
jid = chunk.get("judgment_id", "")
|
| 241 |
text = chunk.get("expanded_context") or chunk.get("chunk_text") or chunk.get("text", "")
|
| 242 |
|
| 243 |
+
type_labels = {
|
| 244 |
+
"statute": f"[STATUTE: {title} | {year}]",
|
| 245 |
+
"procedure": f"[PROCEDURE: {title}]",
|
| 246 |
+
"law_commission": f"[LAW COMMISSION: {title}]",
|
| 247 |
+
"legal_reference": f"[LEGAL REFERENCE: {title}]",
|
| 248 |
+
"statute_qa": f"[LEGAL QA: {title}]",
|
| 249 |
+
}
|
| 250 |
+
header = type_labels.get(source_type, f"[CASE: {title} | {year} | {jid}]")
|
|
|
|
|
|
|
|
|
|
| 251 |
context_parts.append(f"{header}\n{text[:800]}")
|
| 252 |
|
| 253 |
context = "\n\n".join(context_parts) if context_parts else "No relevant sources retrieved."
|
| 254 |
|
| 255 |
+
case_summary = ""
|
| 256 |
+
if cs.get("parties") or cs.get("hypotheses"):
|
| 257 |
+
hyp_text = "\n".join(
|
| 258 |
+
f" - {h['claim']} [{h.get('confidence','?')} confidence] "
|
| 259 |
+
f"| evidence: {', '.join(h.get('evidence', [])) or 'none yet'}"
|
| 260 |
+
for h in cs.get("hypotheses", [])[:4]
|
| 261 |
+
) or " none established"
|
| 262 |
+
|
| 263 |
+
case_summary = f"""
|
| 264 |
+
CASE STATE (built across {cs.get('turn_count', 0)} turns):
|
| 265 |
+
Parties: {', '.join(cs.get('parties', [])) or 'unspecified'}
|
| 266 |
+
Events: {', '.join(cs.get('events', [])) or 'unspecified'}
|
| 267 |
+
Evidence: {', '.join(cs.get('documents', [])) or 'none mentioned'}
|
| 268 |
+
Amounts: {', '.join(cs.get('amounts', [])) or 'none'}
|
| 269 |
+
Active hypotheses:
|
| 270 |
+
{hyp_text}
|
| 271 |
+
Missing facts: {', '.join(cs.get('facts_missing', [])) or 'none critical'}
|
| 272 |
+
Stage: {cs.get('stage', 'intake')}"""
|
| 273 |
+
|
| 274 |
+
interpret_instruction = ""
|
| 275 |
+
should_interpret = analysis.get("should_interpret_context", False)
|
| 276 |
+
if should_interpret and not cs.get("context_interpreted"):
|
| 277 |
+
interpret_instruction = """
|
| 278 |
+
CONTEXT REFLECTION: Before your main response, briefly (2-3 lines) reflect your understanding back to the user. Start with "Based on what you've told me..." This builds trust and confirms you've been tracking the situation."""
|
| 279 |
+
|
| 280 |
+
radar_instruction = """
|
| 281 |
+
PROACTIVE RADAR — add after your main answer when user has described a real situation:
|
| 282 |
+
Add a brief "⚡ You Should Also Know" section (3-4 lines max).
|
| 283 |
+
Surface 1-2 related legal issues or remedies the user hasn't asked about but which are directly relevant.
|
| 284 |
+
Example: User asked about wrongful termination → proactively mention injunction under Specific Relief Act as faster remedy.
|
| 285 |
+
Skip this section for purely academic questions with no personal situation described."""
|
| 286 |
+
|
| 287 |
summary = session.get("summary", "")
|
| 288 |
last_msgs = session.get("last_3_messages", [])
|
| 289 |
+
history_text = "\n".join(
|
| 290 |
+
f"{m['role'].upper()}: {m['content'][:300]}"
|
| 291 |
+
for m in last_msgs[-4:]
|
| 292 |
+
) if last_msgs else ""
|
| 293 |
|
| 294 |
+
user_content = f"""CONVERSATION SUMMARY:
|
| 295 |
+
{summary if summary else "First message."}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 296 |
|
| 297 |
RECENT CONVERSATION:
|
| 298 |
+
{history_text if history_text else "None"}
|
| 299 |
+
{case_summary}
|
| 300 |
|
| 301 |
RETRIEVED LEGAL SOURCES:
|
| 302 |
{context}
|
| 303 |
|
| 304 |
USER MESSAGE: {user_message}
|
| 305 |
|
| 306 |
+
THIS TURN:
|
| 307 |
+
- Legal hypotheses: {', '.join(h['claim'] for h in analysis.get('hypotheses', [])[:3]) or 'analysing'}
|
| 308 |
- Stage: {analysis.get('stage', 'understanding')}
|
| 309 |
- Urgency: {analysis.get('urgency', 'medium')}
|
| 310 |
+
- Response type: {analysis.get('action_needed', 'advice')}
|
| 311 |
+
- Format: {analysis.get('format_decision', 'appropriate for content')}
|
| 312 |
+
{interpret_instruction}
|
| 313 |
|
| 314 |
+
Instructions:
|
| 315 |
+
- Cite specific sources when making legal claims
|
| 316 |
+
- Use your legal knowledge for reasoning and context
|
| 317 |
+
- Format: {analysis.get('format_decision', 'use the most appropriate format for the content type')}
|
| 318 |
+
- Opposition war-gaming: if giving strategy, include what the other side will argue
|
| 319 |
+
{radar_instruction}"""
|
| 320 |
|
| 321 |
response = _client.chat.completions.create(
|
| 322 |
model="llama-3.3-70b-versatile",
|
|
|
|
| 325 |
{"role": "user", "content": user_content}
|
| 326 |
],
|
| 327 |
temperature=0.3,
|
| 328 |
+
max_tokens=1500
|
| 329 |
)
|
| 330 |
|
| 331 |
return response.choices[0].message.content
|
|
|
|
| 333 |
|
| 334 |
# ── Main entry point ──────────────────────────────────────
|
| 335 |
def run_query_v2(user_message: str, session_id: str) -> Dict[str, Any]:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 336 |
start = time.time()
|
|
|
|
|
|
|
| 337 |
session = get_or_create_session(session_id)
|
| 338 |
|
| 339 |
+
# Pass 1
|
| 340 |
try:
|
| 341 |
analysis = analyse(user_message, session)
|
| 342 |
except Exception as e:
|
| 343 |
logger.error(f"Pass 1 failed: {e}")
|
| 344 |
analysis = {
|
| 345 |
+
"tone": "casual", "format_requested": "none",
|
| 346 |
+
"subject": "legal query", "action_needed": "advice",
|
|
|
|
|
|
|
| 347 |
"urgency": "medium",
|
| 348 |
+
"hypotheses": [{"claim": user_message[:80], "confidence": "low", "evidence": []}],
|
| 349 |
+
"facts_extracted": {}, "facts_missing": [],
|
| 350 |
+
"stage": "understanding", "last_response_type": "none",
|
|
|
|
| 351 |
"updated_summary": user_message[:200],
|
| 352 |
+
"search_queries": [user_message[:200]],
|
| 353 |
+
"should_interpret_context": False,
|
| 354 |
+
"format_decision": "none"
|
| 355 |
}
|
| 356 |
|
| 357 |
+
# Extract entities and augment queries for better retrieval
|
| 358 |
+
entities = extract_entities(user_message)
|
| 359 |
+
augmented_message = augment_query(user_message, entities)
|
|
|
|
| 360 |
|
| 361 |
+
# Pass 2
|
| 362 |
+
search_queries = analysis.get("search_queries", [augmented_message])
|
| 363 |
+
if not search_queries:
|
| 364 |
+
search_queries = [augmented_message]
|
| 365 |
+
if augmented_message not in search_queries:
|
| 366 |
+
search_queries.append(augmented_message)
|
| 367 |
|
| 368 |
chunks = []
|
| 369 |
try:
|
| 370 |
chunks = retrieve_parallel(search_queries[:3], top_k=5)
|
| 371 |
except Exception as e:
|
| 372 |
+
logger.error(f"Pass 2 failed: {e}")
|
| 373 |
|
| 374 |
+
# Pass 3
|
| 375 |
try:
|
| 376 |
answer = respond(user_message, analysis, chunks, session)
|
| 377 |
except Exception as e:
|
| 378 |
logger.error(f"Pass 3 failed: {e}")
|
| 379 |
if chunks:
|
| 380 |
fallback = "\n\n".join(
|
| 381 |
+
f"[{c.get('title', 'Source')}]\n{c.get('text', '')[:400]}"
|
| 382 |
for c in chunks[:3]
|
| 383 |
)
|
| 384 |
+
answer = f"LLM service temporarily unavailable. Most relevant excerpts:\n\n{fallback}"
|
| 385 |
else:
|
| 386 |
answer = "I encountered an issue processing your request. Please try again."
|
| 387 |
|
|
|
|
| 388 |
verification_status, unverified_quotes = verify_citations(answer, chunks)
|
|
|
|
|
|
|
| 389 |
update_session(session_id, analysis, user_message, answer)
|
| 390 |
|
|
|
|
| 391 |
sources = []
|
| 392 |
for c in chunks:
|
| 393 |
+
title = c.get("title", "")
|
| 394 |
+
jid = c.get("judgment_id", "")
|
| 395 |
sources.append({
|
| 396 |
"meta": {
|
| 397 |
+
"judgment_id": jid,
|
| 398 |
+
"title": title if title and title != jid else jid,
|
| 399 |
"year": c.get("year", ""),
|
| 400 |
"chunk_index": c.get("chunk_index", 0),
|
| 401 |
"source_type": c.get("source_type", "case_law"),
|
| 402 |
+
"court": c.get("court", "Supreme Court of India")
|
| 403 |
},
|
| 404 |
"text": (c.get("expanded_context") or c.get("chunk_text") or c.get("text", ""))[:600]
|
| 405 |
})
|
|
|
|
| 410 |
"sources": sources,
|
| 411 |
"verification_status": verification_status,
|
| 412 |
"unverified_quotes": unverified_quotes,
|
| 413 |
+
"entities": entities,
|
| 414 |
"num_sources": len(chunks),
|
| 415 |
+
"truncated": False,
|
| 416 |
"session_id": session_id,
|
| 417 |
"analysis": {
|
| 418 |
"tone": analysis.get("tone"),
|
| 419 |
"stage": analysis.get("stage"),
|
| 420 |
"urgency": analysis.get("urgency"),
|
| 421 |
+
"hypotheses": [h["claim"] for h in analysis.get("hypotheses", [])]
|
| 422 |
}
|
| 423 |
}
|
src/ner.py
CHANGED
|
@@ -2,28 +2,26 @@
|
|
| 2 |
NER inference module.
|
| 3 |
Loads fine-tuned DistilBERT and extracts legal entities from query text.
|
| 4 |
|
| 5 |
-
Loaded once at FastAPI startup
|
| 6 |
-
|
| 7 |
|
| 8 |
Example:
|
| 9 |
Input: "What did Justice Chandrachud say about Section 302 IPC?"
|
| 10 |
-
Output: {"JUDGE": ["Justice Chandrachud"],
|
| 11 |
-
"PROVISION": ["Section 302"],
|
| 12 |
"STATUTE": ["IPC"]}
|
| 13 |
|
| 14 |
The augmented query becomes:
|
| 15 |
-
"What did Justice Chandrachud say about Section 302 IPC?
|
| 16 |
JUDGE: Justice Chandrachud PROVISION: Section 302 STATUTE: IPC"
|
| 17 |
-
|
| 18 |
-
WHY augment the query?
|
| 19 |
-
MiniLM embeds the full query string. Adding extracted entities
|
| 20 |
-
explicitly shifts the embedding closer to chunks that mention
|
| 21 |
-
those specific legal terms — improving retrieval precision.
|
| 22 |
"""
|
| 23 |
|
| 24 |
import os
|
|
|
|
| 25 |
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
|
| 26 |
|
|
|
|
|
|
|
| 27 |
NER_MODEL_PATH = os.getenv("NER_MODEL_PATH", "models/ner_model")
|
| 28 |
|
| 29 |
TARGET_ENTITIES = {
|
|
@@ -32,40 +30,56 @@ TARGET_ENTITIES = {
|
|
| 32 |
"PETITIONER", "RESPONDENT", "GPE", "ORG"
|
| 33 |
}
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
|
| 56 |
def extract_entities(text: str) -> dict:
|
| 57 |
"""
|
| 58 |
Run NER on input text.
|
| 59 |
Returns dict of {entity_type: [entity_text, ...]}
|
| 60 |
-
|
| 61 |
"""
|
|
|
|
|
|
|
|
|
|
| 62 |
if not text.strip():
|
| 63 |
return {}
|
| 64 |
|
| 65 |
try:
|
| 66 |
-
results = _ner_pipeline(text)
|
| 67 |
except Exception as e:
|
| 68 |
-
|
| 69 |
return {}
|
| 70 |
|
| 71 |
entities = {}
|
|
@@ -75,12 +89,12 @@ def extract_entities(text: str) -> dict:
|
|
| 75 |
|
| 76 |
if entity_type not in TARGET_ENTITIES:
|
| 77 |
continue
|
| 78 |
-
if len(entity_text) < 2:
|
| 79 |
continue
|
| 80 |
|
| 81 |
if entity_type not in entities:
|
| 82 |
entities[entity_type] = []
|
| 83 |
-
if entity_text not in entities[entity_type]:
|
| 84 |
entities[entity_type].append(entity_text)
|
| 85 |
|
| 86 |
return entities
|
|
@@ -88,8 +102,8 @@ def extract_entities(text: str) -> dict:
|
|
| 88 |
|
| 89 |
def augment_query(query: str, entities: dict) -> str:
|
| 90 |
"""
|
| 91 |
-
Append extracted entities to query string.
|
| 92 |
-
Returns
|
| 93 |
"""
|
| 94 |
if not entities:
|
| 95 |
return query
|
|
@@ -100,20 +114,4 @@ def augment_query(query: str, entities: dict) -> str:
|
|
| 100 |
for etext in texts
|
| 101 |
)
|
| 102 |
|
| 103 |
-
return f"{query} {entity_string}"
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
if __name__ == "__main__":
|
| 107 |
-
# Quick test
|
| 108 |
-
test_queries = [
|
| 109 |
-
"What did Justice Chandrachud say about Article 21?",
|
| 110 |
-
"Find cases related to Section 302 IPC and bail",
|
| 111 |
-
"Supreme Court judgment on fundamental rights in 1978"
|
| 112 |
-
]
|
| 113 |
-
|
| 114 |
-
for q in test_queries:
|
| 115 |
-
entities = extract_entities(q)
|
| 116 |
-
augmented = augment_query(q, entities)
|
| 117 |
-
print(f"\nQuery: {q}")
|
| 118 |
-
print(f"Entities: {entities}")
|
| 119 |
-
print(f"Augmented: {augmented}")
|
|
|
|
| 2 |
NER inference module.
|
| 3 |
Loads fine-tuned DistilBERT and extracts legal entities from query text.
|
| 4 |
|
| 5 |
+
Loaded once at FastAPI startup via load_ner_model().
|
| 6 |
+
Fails gracefully — app runs without NER if model not found.
|
| 7 |
|
| 8 |
Example:
|
| 9 |
Input: "What did Justice Chandrachud say about Section 302 IPC?"
|
| 10 |
+
Output: {"JUDGE": ["Justice Chandrachud"],
|
| 11 |
+
"PROVISION": ["Section 302"],
|
| 12 |
"STATUTE": ["IPC"]}
|
| 13 |
|
| 14 |
The augmented query becomes:
|
| 15 |
+
"What did Justice Chandrachud say about Section 302 IPC?
|
| 16 |
JUDGE: Justice Chandrachud PROVISION: Section 302 STATUTE: IPC"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
"""
|
| 18 |
|
| 19 |
import os
|
| 20 |
+
import logging
|
| 21 |
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
|
| 22 |
|
| 23 |
+
logger = logging.getLogger(__name__)
|
| 24 |
+
|
| 25 |
NER_MODEL_PATH = os.getenv("NER_MODEL_PATH", "models/ner_model")
|
| 26 |
|
| 27 |
TARGET_ENTITIES = {
|
|
|
|
| 30 |
"PETITIONER", "RESPONDENT", "GPE", "ORG"
|
| 31 |
}
|
| 32 |
|
| 33 |
+
_ner_pipeline = None
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def load_ner_model():
    """Load the fine-tuned NER model once at application startup.

    Populates the module-level ``_ner_pipeline`` consumed by
    ``extract_entities``.  Fails gracefully by design: if the model
    directory is missing, or loading raises for any reason, the pipeline
    stays ``None`` and entity extraction is disabled — the app keeps
    running without NER rather than crashing at startup.

    Call this from api/main.py after download_models().
    """
    global _ner_pipeline

    if not os.path.exists(NER_MODEL_PATH):
        # Not fatal: retrieval still works, just without query augmentation.
        logger.warning(
            "NER model not found at %s. "
            "Entity extraction disabled. App will run without NER.",
            NER_MODEL_PATH,
        )
        return

    try:
        logger.info("Loading NER model from %s...", NER_MODEL_PATH)
        tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_PATH)
        model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_PATH)
        _ner_pipeline = pipeline(
            "ner",
            model=model,
            tokenizer=tokenizer,
            # Merge sub-word token predictions into whole entity spans.
            aggregation_strategy="simple",
        )
        logger.info("NER model ready.")
    except Exception as e:
        # Broad catch is deliberate: a corrupt/incompatible model must not
        # take down the API — we degrade to no-NER operation instead.
        logger.error("NER model load failed: %s. Entity extraction disabled.", e)
        _ner_pipeline = None
| 65 |
|
| 66 |
|
| 67 |
def extract_entities(text: str) -> dict:
|
| 68 |
"""
|
| 69 |
Run NER on input text.
|
| 70 |
Returns dict of {entity_type: [entity_text, ...]}
|
| 71 |
+
Returns empty dict if NER not loaded or inference fails.
|
| 72 |
"""
|
| 73 |
+
if _ner_pipeline is None:
|
| 74 |
+
return {}
|
| 75 |
+
|
| 76 |
if not text.strip():
|
| 77 |
return {}
|
| 78 |
|
| 79 |
try:
|
| 80 |
+
results = _ner_pipeline(text[:512])
|
| 81 |
except Exception as e:
|
| 82 |
+
logger.warning(f"NER inference failed: {e}")
|
| 83 |
return {}
|
| 84 |
|
| 85 |
entities = {}
|
|
|
|
| 89 |
|
| 90 |
if entity_type not in TARGET_ENTITIES:
|
| 91 |
continue
|
| 92 |
+
if len(entity_text) < 2:
|
| 93 |
continue
|
| 94 |
|
| 95 |
if entity_type not in entities:
|
| 96 |
entities[entity_type] = []
|
| 97 |
+
if entity_text not in entities[entity_type]:
|
| 98 |
entities[entity_type].append(entity_text)
|
| 99 |
|
| 100 |
return entities
|
|
|
|
| 102 |
|
| 103 |
def augment_query(query: str, entities: dict) -> str:
|
| 104 |
"""
|
| 105 |
+
Append extracted entities to query string for better FAISS retrieval.
|
| 106 |
+
Returns original query unchanged if no entities found.
|
| 107 |
"""
|
| 108 |
if not entities:
|
| 109 |
return query
|
|
|
|
| 114 |
for etext in texts
|
| 115 |
)
|
| 116 |
|
| 117 |
+
return f"{query} {entity_string}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/system_prompt.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
"""
|
| 2 |
-
NyayaSetu System Prompt.
|
| 3 |
-
|
| 4 |
-
|
| 5 |
"""
|
| 6 |
|
| 7 |
BASE_PERSONALITY = """You are NyayaSetu — a sharp, street-smart Indian legal advisor with the instincts of a top-paid advocate and the directness of someone who has seen every trick in the book.
|
|
@@ -15,251 +15,197 @@ PERSONALITY:
|
|
| 15 |
- Street smart. You know how courts actually work, not just how they're supposed to work.
|
| 16 |
- Slightly mischievous. You enjoy finding the angle nobody thought of.
|
| 17 |
- Never preachy. You don't lecture. You advise.
|
| 18 |
-
- Honest about bad news.
|
| 19 |
-
-
|
|
|
|
| 20 |
|
| 21 |
-
REASONING
|
| 22 |
-
1. What legal issues are actually present
|
| 23 |
-
2. What facts do I still need
|
| 24 |
3. What is the other side's strongest argument? Where are they vulnerable?
|
| 25 |
-
4. What are ALL the routes
|
| 26 |
5. Which route is most winnable given this user's specific situation?
|
| 27 |
6. What should they do FIRST and why?
|
| 28 |
|
| 29 |
THE LEGAL FREEWAY MISSION:
|
| 30 |
Always look for the angle nobody thinks of. The criminal complaint that costs nothing but changes the negotiation entirely. The procedural move that creates immediate pressure. The section nobody mentioned that applies perfectly. When you find it, lead with it.
|
| 31 |
|
| 32 |
-
CONVERSATION PHASES —
|
| 33 |
-
- Intake:
|
| 34 |
-
- Understanding:
|
| 35 |
-
- Analysis:
|
| 36 |
-
- Strategy: Full picture
|
| 37 |
|
| 38 |
RESPONSE VARIETY — never be monotonous:
|
| 39 |
-
- If
|
| 40 |
-
- Rotate
|
| 41 |
-
- Match
|
| 42 |
|
| 43 |
OPPOSITION THINKING — always:
|
| 44 |
-
- Ask
|
| 45 |
-
- Flag
|
| 46 |
-
- Find their weakest point
|
| 47 |
|
| 48 |
-
|
| 49 |
-
-
|
| 50 |
-
-
|
| 51 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
-
DISCLAIMER — always at
|
| 54 |
-
|
| 55 |
-
Never open with
|
| 56 |
|
| 57 |
|
| 58 |
-
# ── Tone maps ─────────────────────────────────────────────
|
| 59 |
TONE_MAP = {
|
| 60 |
-
"panicked": """
|
| 61 |
-
|
| 62 |
-
-
|
| 63 |
-
- Keep sentences short. No complex legal terminology in the first response.
|
| 64 |
-
- Acknowledge the situation briefly before moving to action.
|
| 65 |
- Give them ONE thing to do immediately, then explain why.
|
| 66 |
- Do not overwhelm with options in the first response.""",
|
| 67 |
|
| 68 |
-
"analytical": """
|
| 69 |
-
|
| 70 |
-
-
|
| 71 |
-
-
|
| 72 |
-
- Use structured format — numbered options, comparison tables where helpful.
|
| 73 |
-
- They can handle nuance. Give it to them.
|
| 74 |
- Cite specific sections and cases where relevant.""",
|
| 75 |
|
| 76 |
-
"aggressive": """
|
| 77 |
-
|
| 78 |
-
-
|
| 79 |
-
- Lead with the strongest offensive move available.
|
| 80 |
- Tell them what creates maximum pressure on the other side.
|
| 81 |
- Be direct: "Here's what hurts them most."
|
| 82 |
-
-
|
| 83 |
-
|
| 84 |
-
"casual": """
|
| 85 |
-
|
| 86 |
-
-
|
| 87 |
-
-
|
| 88 |
-
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
The user has lost hope or feels the situation is hopeless.
|
| 93 |
-
- Acknowledge the difficulty directly and briefly.
|
| 94 |
- Immediately pivot to what IS possible.
|
| 95 |
- Find at least one angle they haven't considered.
|
| 96 |
-
- Be honest about
|
| 97 |
-
- End with
|
| 98 |
}
|
| 99 |
|
| 100 |
-
# ── Format maps ───────────────────────────────────────────
|
| 101 |
FORMAT_MAP = {
|
| 102 |
-
"bullets": ""
|
| 103 |
-
|
| 104 |
-
Use
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
"table": """
|
| 113 |
-
Format the comparison as a markdown table.
|
| 114 |
-
Use | Column | Column | format.
|
| 115 |
-
Include a header row. Keep cell content concise.""",
|
| 116 |
-
|
| 117 |
-
"prose": """
|
| 118 |
-
Write in flowing paragraphs. No bullet points or numbered lists.
|
| 119 |
-
Use natural paragraph breaks between distinct ideas.""",
|
| 120 |
-
|
| 121 |
-
"none": """
|
| 122 |
-
Choose the format that best fits the content:
|
| 123 |
-
- Use numbered lists for options or steps
|
| 124 |
-
- Use bullet points for features or facts
|
| 125 |
-
- Use tables for comparisons
|
| 126 |
-
- Use prose for explanations and analysis
|
| 127 |
-
- Use headers (##) to separate major sections in long responses
|
| 128 |
Never write everything as one long paragraph."""
|
| 129 |
}
|
| 130 |
|
| 131 |
-
# ── Action maps ───────────────────────────────────────────
|
| 132 |
ACTION_MAP = {
|
| 133 |
-
"question": """
|
| 134 |
-
You need one more critical piece of information before you can give useful advice.
|
| 135 |
-
Ask exactly ONE question — the most important one.
|
| 136 |
Briefly explain why you need this information (one sentence).
|
| 137 |
Do not ask multiple questions even if you have several.""",
|
| 138 |
|
| 139 |
-
"reflection": """
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
If there are multiple options, rank them by what you'd actually recommend first.
|
| 154 |
-
Tell them what to do TODAY, not just eventually.""",
|
| 155 |
-
|
| 156 |
-
"strategy": """
|
| 157 |
-
Full strategic assessment. Structure it as:
|
| 158 |
1. Situation summary (2-3 sentences max)
|
| 159 |
2. Legal routes available (ranked by winnability)
|
| 160 |
3. What to do first and why
|
| 161 |
4. What the other side will do and how to counter it
|
| 162 |
5. What to watch out for
|
|
|
|
| 163 |
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
"reassurance": """
|
| 179 |
-
The user needs to know the situation is manageable.
|
| 180 |
-
Acknowledge the difficulty briefly.
|
| 181 |
-
Immediately establish that there are options.
|
| 182 |
-
Give one concrete thing that demonstrates this isn't hopeless.
|
| 183 |
Then move forward."""
|
| 184 |
}
|
| 185 |
|
| 186 |
-
# ── Stage-specific instructions ───────────────────────────
|
| 187 |
STAGE_MAP = {
|
| 188 |
-
"intake": """
|
| 189 |
-
This is the first message or the user has just described their situation for the first time.
|
| 190 |
Priority: Make them feel heard. Show you've grasped the key issue.
|
| 191 |
-
Approach: Brief reflection + one targeted question OR immediate reassurance if
|
| 192 |
-
Do NOT launch into full legal analysis yet — you
|
| 193 |
-
|
| 194 |
-
"understanding": """
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
Priority: Share what you're finding. Keep the conversation moving.
|
| 203 |
-
Approach: Tell them what legal issues you see, what routes exist, what you're assessing.
|
| 204 |
Can ask a clarifying question but lead with a finding.""",
|
| 205 |
|
| 206 |
-
"strategy": """
|
| 207 |
-
You have the full picture. Time to deliver.
|
| 208 |
Priority: Give them a real plan they can act on today.
|
| 209 |
-
|
| 210 |
This response should feel like what a senior advocate delivers in a paid consultation.""",
|
| 211 |
|
| 212 |
-
"followup": """
|
| 213 |
-
The user is asking a follow-up question about something already discussed.
|
| 214 |
Priority: Answer directly and specifically. No need to re-establish context.
|
| 215 |
-
Approach: Direct answer. Reference the earlier analysis where relevant.
|
| 216 |
Keep it tight — they already have the background."""
|
| 217 |
}
|
| 218 |
|
| 219 |
|
| 220 |
def build_prompt(analysis: dict) -> str:
|
| 221 |
-
"""
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
"""
|
| 225 |
-
tone = analysis.get("tone", "casual")
|
| 226 |
-
fmt = analysis.get("format_requested", "none")
|
| 227 |
-
action = analysis.get("action_needed", "advice")
|
| 228 |
-
stage = analysis.get("stage", "understanding")
|
| 229 |
-
|
| 230 |
-
tone_instruction = TONE_MAP.get(tone, TONE_MAP["casual"])
|
| 231 |
-
format_instruction = FORMAT_MAP.get(fmt, FORMAT_MAP["none"])
|
| 232 |
-
action_instruction = ACTION_MAP.get(action, ACTION_MAP["advice"])
|
| 233 |
-
stage_instruction = STAGE_MAP.get(stage, STAGE_MAP["understanding"])
|
| 234 |
|
| 235 |
return f"""{BASE_PERSONALITY}
|
| 236 |
|
| 237 |
── CURRENT TURN CONTEXT ──────────────────────────────────
|
| 238 |
|
| 239 |
CONVERSATION STAGE: {stage.upper()}
|
| 240 |
-
{
|
| 241 |
|
| 242 |
USER TONE DETECTED: {tone.upper()}
|
| 243 |
-
{
|
| 244 |
|
| 245 |
RESPONSE TYPE NEEDED: {action.upper()}
|
| 246 |
-
{
|
| 247 |
|
| 248 |
OUTPUT FORMAT: {fmt.upper()}
|
| 249 |
-
{
|
| 250 |
|
| 251 |
── END CONTEXT ───────────────────────────────────────────"""
|
| 252 |
|
| 253 |
|
| 254 |
-
# ── Pass 1
|
| 255 |
-
ANALYSIS_PROMPT = """You are
|
| 256 |
-
|
| 257 |
-
Given:
|
| 258 |
-
- Conversation summary (what has happened so far)
|
| 259 |
-
- Last 3 messages
|
| 260 |
-
- New user message
|
| 261 |
|
| 262 |
-
Output
|
| 263 |
|
| 264 |
{
|
| 265 |
"tone": "panicked|analytical|aggressive|casual|defeated",
|
|
@@ -267,17 +213,34 @@ Output ONLY a valid JSON dict with these exact keys:
|
|
| 267 |
"subject": "brief description of main legal subject",
|
| 268 |
"action_needed": "question|reflection|partial_finding|advice|strategy|explanation|observation|reassurance",
|
| 269 |
"urgency": "immediate|medium|low",
|
| 270 |
-
"
|
| 271 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
"stage": "intake|understanding|analysis|strategy|followup",
|
| 273 |
"last_response_type": "question|reflection|partial_finding|advice|strategy|explanation|observation|reassurance|none",
|
| 274 |
-
"updated_summary": "3-4 line compressed summary of
|
| 275 |
-
"search_queries": ["
|
|
|
|
|
|
|
| 276 |
}
|
| 277 |
|
| 278 |
Rules:
|
| 279 |
- If last_response_type was "question", action_needed CANNOT be "question"
|
| 280 |
-
-
|
| 281 |
-
-
|
| 282 |
-
-
|
|
|
|
|
|
|
|
|
|
| 283 |
- Output ONLY the JSON. No explanation. No preamble. No markdown fences."""
|
|
|
|
| 1 |
"""
|
| 2 |
+
NyayaSetu System Prompt — Full Intelligence Layer.
|
| 3 |
+
Personality, reasoning structure, format intelligence,
|
| 4 |
+
dynamic prompt assembly, analysis instructions.
|
| 5 |
"""
|
| 6 |
|
| 7 |
BASE_PERSONALITY = """You are NyayaSetu — a sharp, street-smart Indian legal advisor with the instincts of a top-paid advocate and the directness of someone who has seen every trick in the book.
|
|
|
|
| 15 |
- Street smart. You know how courts actually work, not just how they're supposed to work.
|
| 16 |
- Slightly mischievous. You enjoy finding the angle nobody thought of.
|
| 17 |
- Never preachy. You don't lecture. You advise.
|
| 18 |
+
- Honest about bad news. Say it directly in the first sentence then immediately pivot to what CAN be done.
|
| 19 |
+
- Think about leverage, not just rights. What creates pressure? What costs the other side more than it costs you?
|
| 20 |
+
- Spontaneous and human. Rotate naturally between questions, observations, findings, reassurance, advice. Never robotic.
|
| 21 |
|
| 22 |
+
REASONING — how you think before every response:
|
| 23 |
+
1. What legal issues are actually present? Including non-obvious ones the user didn't mention.
|
| 24 |
+
2. What facts do I still need that would change the strategy?
|
| 25 |
3. What is the other side's strongest argument? Where are they vulnerable?
|
| 26 |
+
4. What are ALL the routes — including the non-obvious ones?
|
| 27 |
5. Which route is most winnable given this user's specific situation?
|
| 28 |
6. What should they do FIRST and why?
|
| 29 |
|
| 30 |
THE LEGAL FREEWAY MISSION:
|
| 31 |
Always look for the angle nobody thinks of. The criminal complaint that costs nothing but changes the negotiation entirely. The procedural move that creates immediate pressure. The section nobody mentioned that applies perfectly. When you find it, lead with it.
|
| 32 |
|
| 33 |
+
CONVERSATION PHASES — move through naturally:
|
| 34 |
+
- Intake: Listen. Reflect back. Make them feel understood.
|
| 35 |
+
- Understanding: Ask ONE surgical question — the most important one first.
|
| 36 |
+
- Analysis: Share partial findings. "Here's what I'm seeing..." Keep moving.
|
| 37 |
+
- Strategy: Full picture. Deliver options ranked by winnability. What to do first.
|
| 38 |
|
| 39 |
RESPONSE VARIETY — never be monotonous:
|
| 40 |
+
- If last response was a question, this response cannot be a question.
|
| 41 |
+
- Rotate: question → finding → observation → advice → reflection → provocation → reassurance
|
| 42 |
+
- Match user energy. Panicked user gets calm and direct. Analytical user gets full reasoning.
|
| 43 |
|
| 44 |
OPPOSITION THINKING — always:
|
| 45 |
+
- Ask what the other side will argue.
|
| 46 |
+
- Flag proactively: "The other side will likely say X. Here's why that doesn't hold."
|
| 47 |
+
- Find their weakest point. Make the user's strategy exploit it.
|
| 48 |
|
| 49 |
+
FORMAT INTELLIGENCE — choose based on content:
|
| 50 |
+
- Options or steps → numbered list
|
| 51 |
+
- Features or facts → bullets
|
| 52 |
+
- Comparisons → table
|
| 53 |
+
- Explanation or analysis → prose paragraphs
|
| 54 |
+
- Long response with multiple sections → headers (##) to separate
|
| 55 |
+
- Never put everything in one long paragraph
|
| 56 |
+
- Never use the same format twice in a row if it doesn't fit
|
| 57 |
|
| 58 |
+
DISCLAIMER — always at end, never at start:
|
| 59 |
+
"Note: This is not legal advice. Consult a qualified advocate for your specific situation."
|
| 60 |
+
Never open with disclaimer. It kills the energy."""
|
| 61 |
|
| 62 |
|
|
|
|
| 63 |
TONE_MAP = {
|
| 64 |
+
"panicked": """User is in distress. Priority: calm and immediate clarity.
|
| 65 |
+
- Open with the most important thing they need to know RIGHT NOW
|
| 66 |
+
- Short sentences. No complex terminology in first response.
|
|
|
|
|
|
|
| 67 |
- Give them ONE thing to do immediately, then explain why.
|
| 68 |
- Do not overwhelm with options in the first response.""",
|
| 69 |
|
| 70 |
+
"analytical": """User thinks carefully and wants full understanding.
|
| 71 |
+
- Give complete reasoning, not just conclusion.
|
| 72 |
+
- Explain why each option exists and its tradeoffs.
|
| 73 |
+
- Use structured format — numbered options, tables for comparisons.
|
|
|
|
|
|
|
| 74 |
- Cite specific sections and cases where relevant.""",
|
| 75 |
|
| 76 |
+
"aggressive": """User is angry and wants to fight.
|
| 77 |
+
- Match energy without matching anger.
|
| 78 |
+
- Lead with strongest offensive move available.
|
|
|
|
| 79 |
- Tell them what creates maximum pressure on the other side.
|
| 80 |
- Be direct: "Here's what hurts them most."
|
| 81 |
+
- Only suggest compromise if it's clearly the smartest move.""",
|
| 82 |
+
|
| 83 |
+
"casual": """User is relaxed and conversational.
|
| 84 |
+
- Match register. Don't be overly formal.
|
| 85 |
+
- Plain language. Explain legal concepts in everyday terms.
|
| 86 |
+
- Use analogies and examples freely.
|
| 87 |
+
- Still precise and accurate — just accessible.""",
|
| 88 |
+
|
| 89 |
+
"defeated": """User has lost hope.
|
| 90 |
+
- Acknowledge difficulty briefly.
|
|
|
|
|
|
|
| 91 |
- Immediately pivot to what IS possible.
|
| 92 |
- Find at least one angle they haven't considered.
|
| 93 |
+
- Be honest about realistic outcomes but never write off options prematurely.
|
| 94 |
+
- End with one clear next step they can take today."""
|
| 95 |
}
|
| 96 |
|
|
|
|
| 97 |
FORMAT_MAP = {
|
| 98 |
+
"bullets": "Use bullet points (- ) for all key items. Sub-points with -. One idea per bullet.",
|
| 99 |
+
"numbered": "Use numbered list. Each number is one step, option, or point. Order by importance or chronology.",
|
| 100 |
+
"table": "Use markdown table format. | Column | Column |. Include header row. Keep cells concise.",
|
| 101 |
+
"prose": "Write in flowing paragraphs. No bullets or numbered lists. Natural paragraph breaks.",
|
| 102 |
+
"none": """Choose format that fits content:
|
| 103 |
+
- Steps or options → numbered
|
| 104 |
+
- Facts or features → bullets
|
| 105 |
+
- Comparisons → table
|
| 106 |
+
- Explanation → prose
|
| 107 |
+
- Long response → ## headers to separate sections
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
Never write everything as one long paragraph."""
|
| 109 |
}
|
| 110 |
|
|
|
|
| 111 |
ACTION_MAP = {
|
| 112 |
+
"question": """Ask exactly ONE question — the most important one.
|
|
|
|
|
|
|
| 113 |
Briefly explain why you need this information (one sentence).
|
| 114 |
Do not ask multiple questions even if you have several.""",
|
| 115 |
|
| 116 |
+
"reflection": """Reflect back what you understand about the situation.
|
| 117 |
+
Show you've grasped both the legal issue and the human weight of it.
|
| 118 |
+
Signal where you're going: "Here's what I need to understand..." or "Here's what this tells me..." """,
|
| 119 |
+
|
| 120 |
+
"partial_finding": """Share what you've found so far even if picture isn't complete.
|
| 121 |
+
Frame as: "Based on what you've told me, here's what I'm seeing..."
|
| 122 |
+
Be clear about what's established vs uncertain.
|
| 123 |
+
End with what you need next.""",
|
| 124 |
+
|
| 125 |
+
"advice": """Give advice directly. Lead with recommendation then reasoning.
|
| 126 |
+
Multiple options → rank by what you'd recommend first.
|
| 127 |
+
Tell them what to do TODAY not just eventually.""",
|
| 128 |
+
|
| 129 |
+
"strategy": """Full strategic assessment:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
1. Situation summary (2-3 sentences max)
|
| 131 |
2. Legal routes available (ranked by winnability)
|
| 132 |
3. What to do first and why
|
| 133 |
4. What the other side will do and how to counter it
|
| 134 |
5. What to watch out for
|
| 135 |
+
Be specific. Cite sections and procedures. Give a real plan.""",
|
| 136 |
|
| 137 |
+
"explanation": """Explain the legal concept clearly.
|
| 138 |
+
Start with plain language meaning.
|
| 139 |
+
Then apply to this specific situation.
|
| 140 |
+
Use analogy if it helps.
|
| 141 |
+
End with practical implication for user.""",
|
| 142 |
+
|
| 143 |
+
"observation": """Share a key observation the user may not have noticed.
|
| 144 |
+
Frame as insight: "The thing that stands out here is..."
|
| 145 |
+
Should reveal opportunity or flag risk.""",
|
| 146 |
+
|
| 147 |
+
"reassurance": """Acknowledge difficulty briefly.
|
| 148 |
+
Immediately establish that options exist.
|
| 149 |
+
Give one concrete thing that shows this isn't hopeless.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
Then move forward."""
|
| 151 |
}
|
| 152 |
|
|
|
|
| 153 |
STAGE_MAP = {
|
| 154 |
+
"intake": """First message or user just described situation.
|
|
|
|
| 155 |
Priority: Make them feel heard. Show you've grasped the key issue.
|
| 156 |
+
Approach: Brief reflection + one targeted question OR immediate reassurance if urgent.
|
| 157 |
+
Do NOT launch into full legal analysis yet — you need more facts.""",
|
| 158 |
+
|
| 159 |
+
"understanding": """Still gathering critical facts.
|
| 160 |
+
Priority: Get the one fact that most changes the strategy.
|
| 161 |
+
Ask ONE surgical question. Explain briefly why it matters.
|
| 162 |
+
Do not ask multiple questions. Do not give full strategy yet.""",
|
| 163 |
+
|
| 164 |
+
"analysis": """Enough facts for partial analysis.
|
| 165 |
+
Priority: Share what you're finding. Keep conversation moving.
|
| 166 |
+
Tell them what legal issues you see, what routes exist.
|
|
|
|
|
|
|
| 167 |
Can ask a clarifying question but lead with a finding.""",
|
| 168 |
|
| 169 |
+
"strategy": """Full picture established. Time to deliver.
|
|
|
|
| 170 |
Priority: Give them a real plan they can act on today.
|
| 171 |
+
Full strategic response — routes ranked by winnability, what to do first, what to watch out for.
|
| 172 |
This response should feel like what a senior advocate delivers in a paid consultation.""",
|
| 173 |
|
| 174 |
+
"followup": """User asking follow-up on something already discussed.
|
|
|
|
| 175 |
Priority: Answer directly and specifically. No need to re-establish context.
|
|
|
|
| 176 |
Keep it tight — they already have the background."""
|
| 177 |
}
|
| 178 |
|
| 179 |
|
| 180 |
def build_prompt(analysis: dict) -> str:
    """Assemble the full system prompt for this turn.

    Combines the static BASE_PERSONALITY with per-turn steering blocks
    selected from STAGE_MAP / TONE_MAP / ACTION_MAP / FORMAT_MAP using the
    Pass-1 analysis dict. Unknown or missing keys fall back to safe defaults
    ("understanding" stage, "casual" tone, "advice" action, "none" format),
    so a malformed analysis never raises here.

    Args:
        analysis: Pass-1 analysis JSON (keys: "tone", "format_requested",
            "action_needed", "stage"; all optional).

    Returns:
        The complete system-prompt string for the response model.
    """
    tone = analysis.get("tone", "casual")
    fmt = analysis.get("format_requested", "none")
    action = analysis.get("action_needed", "advice")
    stage = analysis.get("stage", "understanding")

    return f"""{BASE_PERSONALITY}

── CURRENT TURN CONTEXT ──────────────────────────────────

CONVERSATION STAGE: {stage.upper()}
{STAGE_MAP.get(stage, STAGE_MAP["understanding"])}

USER TONE DETECTED: {tone.upper()}
{TONE_MAP.get(tone, TONE_MAP["casual"])}

RESPONSE TYPE NEEDED: {action.upper()}
{ACTION_MAP.get(action, ACTION_MAP["advice"])}

OUTPUT FORMAT: {fmt.upper()}
{FORMAT_MAP.get(fmt, FORMAT_MAP["none"])}

── END CONTEXT ───────────────────────────────────────────"""
|
| 203 |
|
| 204 |
|
| 205 |
+
# ── Pass 1 Analysis Prompt ────────────────────────────────
|
| 206 |
+
ANALYSIS_PROMPT = """You are the analytical layer for a legal assistant. Analyse the user message and conversation state, then output ONLY a valid JSON dict.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
|
| 208 |
+
Output this exact structure:
|
| 209 |
|
| 210 |
{
|
| 211 |
"tone": "panicked|analytical|aggressive|casual|defeated",
|
|
|
|
| 213 |
"subject": "brief description of main legal subject",
|
| 214 |
"action_needed": "question|reflection|partial_finding|advice|strategy|explanation|observation|reassurance",
|
| 215 |
"urgency": "immediate|medium|low",
|
| 216 |
+
"hypotheses": [
|
| 217 |
+
{"claim": "legal hypothesis 1", "confidence": "high|medium|low", "evidence": ["evidence supporting this"]},
|
| 218 |
+
{"claim": "legal hypothesis 2", "confidence": "high|medium|low", "evidence": []}
|
| 219 |
+
],
|
| 220 |
+
"facts_extracted": {
|
| 221 |
+
"parties": ["person or organisation mentioned"],
|
| 222 |
+
"events": ["what happened"],
|
| 223 |
+
"documents": ["evidence or documents mentioned"],
|
| 224 |
+
"amounts": ["money figures mentioned"],
|
| 225 |
+
"locations": ["places mentioned"],
|
| 226 |
+
"disputes": ["core dispute described"],
|
| 227 |
+
"timeline_events": ["event with approximate time if mentioned"]
|
| 228 |
+
},
|
| 229 |
+
"facts_missing": ["critical fact 1 that would change strategy", "critical fact 2"],
|
| 230 |
"stage": "intake|understanding|analysis|strategy|followup",
|
| 231 |
"last_response_type": "question|reflection|partial_finding|advice|strategy|explanation|observation|reassurance|none",
|
| 232 |
+
"updated_summary": "3-4 line compressed summary of ENTIRE conversation including this new message. Must capture all key facts, legal issues identified, and current stage.",
|
| 233 |
+
"search_queries": ["specific legal question for FAISS search 1", "specific legal question 2", "specific legal question 3"],
|
| 234 |
+
"should_interpret_context": true,
|
| 235 |
+
"format_decision": "prose|numbered|bullets|table|mixed — choose based on content type of this specific response"
|
| 236 |
}
|
| 237 |
|
| 238 |
Rules:
|
| 239 |
- If last_response_type was "question", action_needed CANNOT be "question"
|
| 240 |
+
- hypotheses must include non-obvious legal angles not just obvious ones
|
| 241 |
+
- facts_extracted must capture ALL facts mentioned even if implied
|
| 242 |
+
- search_queries must be specific legal questions optimised for semantic search — not generic terms
|
| 243 |
+
- updated_summary must be a complete brief of everything known so far
|
| 244 |
+
- should_interpret_context: true if agent should reflect its understanding back to user (useful every 3-4 turns)
|
| 245 |
+
- format_decision: choose the format that best fits what this specific response needs to communicate
|
| 246 |
- Output ONLY the JSON. No explanation. No preamble. No markdown fences."""
|
src/verify.py
CHANGED
|
@@ -1,18 +1,31 @@
|
|
| 1 |
"""
|
| 2 |
Citation verification module.
|
| 3 |
-
|
| 4 |
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
"""
|
| 9 |
|
| 10 |
import re
|
| 11 |
import unicodedata
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
|
| 14 |
def _normalise(text: str) -> str:
|
| 15 |
-
"""Lowercase, strip punctuation, collapse whitespace."""
|
| 16 |
text = text.lower()
|
| 17 |
text = unicodedata.normalize("NFKD", text)
|
| 18 |
text = re.sub(r"[^\w\s]", " ", text)
|
|
@@ -20,53 +33,141 @@ def _normalise(text: str) -> str:
|
|
| 20 |
return text
|
| 21 |
|
| 22 |
|
| 23 |
-
def _extract_quotes(text: str) -> list
|
| 24 |
-
"""Extract
|
|
|
|
|
|
|
|
|
|
| 25 |
patterns = [
|
| 26 |
-
r'"([^"]{
|
| 27 |
-
r'\u201c([^\u201d]{
|
| 28 |
-
r"'([^']{10,})'", # single quotes
|
| 29 |
]
|
| 30 |
-
quotes = []
|
| 31 |
for pattern in patterns:
|
| 32 |
found = re.findall(pattern, text)
|
| 33 |
quotes.extend(found)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
return quotes
|
| 35 |
|
| 36 |
|
| 37 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
"""
|
| 39 |
-
Check
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
Returns:
|
| 42 |
(verified: bool, unverified_quotes: list[str])
|
| 43 |
|
| 44 |
Logic:
|
| 45 |
-
- Extract
|
| 46 |
-
- If no
|
| 47 |
-
- For each
|
| 48 |
-
- If ALL
|
| 49 |
-
- If ANY
|
| 50 |
"""
|
|
|
|
|
|
|
|
|
|
| 51 |
quotes = _extract_quotes(answer)
|
| 52 |
|
| 53 |
if not quotes:
|
| 54 |
return True, []
|
| 55 |
|
| 56 |
-
# Build normalised context corpus
|
| 57 |
-
all_context_text = " ".join(
|
| 58 |
-
_normalise(ctx.get("text", "") or ctx.get("excerpt", ""))
|
| 59 |
-
for ctx in contexts
|
| 60 |
-
)
|
| 61 |
-
|
| 62 |
unverified = []
|
| 63 |
for quote in quotes:
|
| 64 |
-
|
| 65 |
-
# Skip very short normalised quotes — likely artifacts
|
| 66 |
-
if len(normalised_quote) < 8:
|
| 67 |
continue
|
| 68 |
-
if
|
| 69 |
-
unverified.append(quote)
|
| 70 |
|
| 71 |
if unverified:
|
| 72 |
return False, unverified
|
|
|
|
| 1 |
"""
|
| 2 |
Citation verification module.
|
| 3 |
+
Uses semantic similarity (MiniLM cosine) instead of exact substring matching.
|
| 4 |
|
| 5 |
+
Why: LLMs paraphrase retrieved text rather than quoting verbatim.
|
| 6 |
+
Exact matching almost always returns Unverified even when the answer
|
| 7 |
+
is fully grounded in the retrieved sources.
|
| 8 |
+
|
| 9 |
+
Threshold: cosine similarity > 0.72 = verified.
|
| 10 |
+
Same MiniLM model already loaded in memory — no extra cost.
|
| 11 |
+
|
| 12 |
+
Documented limitation: semantic similarity can pass hallucinations
|
| 13 |
+
that are topically similar to retrieved text but factually different.
|
| 14 |
+
This is a known tradeoff vs exact matching.
|
| 15 |
"""
|
| 16 |
|
| 17 |
import re
|
| 18 |
import unicodedata
|
| 19 |
+
import logging
|
| 20 |
+
import numpy as np
|
| 21 |
+
|
| 22 |
+
logger = logging.getLogger(__name__)
|
| 23 |
+
|
| 24 |
+
# ── Similarity threshold ──────────────────────────────────
|
| 25 |
+
SIMILARITY_THRESHOLD = 0.72 # cosine similarity — tunable
|
| 26 |
|
| 27 |
|
| 28 |
def _normalise(text: str) -> str:
|
|
|
|
| 29 |
text = text.lower()
|
| 30 |
text = unicodedata.normalize("NFKD", text)
|
| 31 |
text = re.sub(r"[^\w\s]", " ", text)
|
|
|
|
| 33 |
return text
|
| 34 |
|
| 35 |
|
| 36 |
+
def _extract_quotes(text: str) -> list:
|
| 37 |
+
"""Extract quoted phrases and key sentences from answer."""
|
| 38 |
+
quotes = []
|
| 39 |
+
|
| 40 |
+
# Extract explicitly quoted phrases
|
| 41 |
patterns = [
|
| 42 |
+
r'"([^"]{15,})"',
|
| 43 |
+
r'\u201c([^\u201d]{15,})\u201d',
|
|
|
|
| 44 |
]
|
|
|
|
| 45 |
for pattern in patterns:
|
| 46 |
found = re.findall(pattern, text)
|
| 47 |
quotes.extend(found)
|
| 48 |
+
|
| 49 |
+
# If no explicit quotes, extract key sentences for verification
|
| 50 |
+
if not quotes:
|
| 51 |
+
sentences = re.split(r'(?<=[.!?])\s+', text)
|
| 52 |
+
# Take sentences that make specific legal claims
|
| 53 |
+
for s in sentences:
|
| 54 |
+
s = s.strip()
|
| 55 |
+
# Sentences with section numbers, case citations, or specific claims
|
| 56 |
+
if (len(s) > 40 and
|
| 57 |
+
any(indicator in s.lower() for indicator in [
|
| 58 |
+
"section", "act", "ipc", "crpc", "court held",
|
| 59 |
+
"judgment", "article", "rule", "according to",
|
| 60 |
+
"as per", "under", "punishable", "imprisonment"
|
| 61 |
+
])):
|
| 62 |
+
quotes.append(s)
|
| 63 |
+
if len(quotes) >= 3: # cap at 3 sentences
|
| 64 |
+
break
|
| 65 |
+
|
| 66 |
return quotes
|
| 67 |
|
| 68 |
|
| 69 |
+
def _get_embedder():
|
| 70 |
+
"""Get the already-loaded embedder — no double loading."""
|
| 71 |
+
try:
|
| 72 |
+
from src.retrieval import _embedder as embedder
|
| 73 |
+
return embedder
|
| 74 |
+
except ImportError:
|
| 75 |
+
pass
|
| 76 |
+
|
| 77 |
+
try:
|
| 78 |
+
from src.embed import _model as embedder
|
| 79 |
+
return embedder
|
| 80 |
+
except ImportError:
|
| 81 |
+
pass
|
| 82 |
+
|
| 83 |
+
try:
|
| 84 |
+
# Last resort — import from retrieval module globals
|
| 85 |
+
import src.retrieval as retrieval_module
|
| 86 |
+
if hasattr(retrieval_module, '_embedder'):
|
| 87 |
+
return retrieval_module._embedder
|
| 88 |
+
if hasattr(retrieval_module, 'embedder'):
|
| 89 |
+
return retrieval_module.embedder
|
| 90 |
+
except Exception:
|
| 91 |
+
pass
|
| 92 |
+
|
| 93 |
+
return None
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def _cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
|
| 97 |
+
"""Cosine similarity between two vectors."""
|
| 98 |
+
norm_a = np.linalg.norm(a)
|
| 99 |
+
norm_b = np.linalg.norm(b)
|
| 100 |
+
if norm_a == 0 or norm_b == 0:
|
| 101 |
+
return 0.0
|
| 102 |
+
return float(np.dot(a, b) / (norm_a * norm_b))
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def _semantic_verify(quote: str, contexts: list) -> bool:
    """Check whether *quote* is semantically grounded in any context chunk.

    Returns True if cosine similarity with any chunk meets
    SIMILARITY_THRESHOLD. Falls back to exact (normalised) substring
    matching when no embedder is available or embedding fails.

    Note: the original re-encoded every chunk per quote despite a comment
    claiming cached embeddings; chunks are now encoded in a single batch
    call, which is one model invocation instead of N.
    """
    embedder = _get_embedder()
    if embedder is None:
        # Fallback to exact matching if embedder unavailable
        all_text = " ".join(_normalise(c.get("text", "")) for c in contexts)
        return _normalise(quote) in all_text

    try:
        quote_embedding = embedder.encode([quote], show_progress_bar=False)[0]

        # Collect usable chunk texts (same filter as before: non-empty,
        # at least 10 non-whitespace chars, truncated to 512 chars).
        texts = []
        for ctx in contexts:
            ctx_text = ctx.get("text", "") or ctx.get("expanded_context", "")
            if not ctx_text or len(ctx_text.strip()) < 10:
                continue
            texts.append(ctx_text[:512])
        if not texts:
            return False

        # Batch-encode all chunks at once, then compare.
        ctx_embeddings = embedder.encode(texts, show_progress_bar=False)
        for ctx_embedding in ctx_embeddings:
            if _cosine_similarity(quote_embedding, ctx_embedding) >= SIMILARITY_THRESHOLD:
                return True

        return False

    except Exception as e:
        logger.warning(f"Semantic verification failed: {e}, falling back to exact match")
        all_text = " ".join(_normalise(c.get("text", "")) for c in contexts)
        return _normalise(quote) in all_text
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def verify_citations(answer: str, contexts: list) -> tuple:
|
| 142 |
+
"""
|
| 143 |
+
Verify whether answer claims are grounded in retrieved contexts.
|
| 144 |
+
|
| 145 |
+
Uses semantic similarity (cosine > 0.72) instead of exact matching.
|
| 146 |
|
| 147 |
Returns:
|
| 148 |
(verified: bool, unverified_quotes: list[str])
|
| 149 |
|
| 150 |
Logic:
|
| 151 |
+
- Extract quoted phrases and key legal claim sentences
|
| 152 |
+
- If no verifiable claims: return (True, [])
|
| 153 |
+
- For each claim: check semantic similarity against all context chunks
|
| 154 |
+
- If ALL claims verified: (True, [])
|
| 155 |
+
- If ANY claim unverified: (False, [list of unverified claims])
|
| 156 |
"""
|
| 157 |
+
if not contexts:
|
| 158 |
+
return False, []
|
| 159 |
+
|
| 160 |
quotes = _extract_quotes(answer)
|
| 161 |
|
| 162 |
if not quotes:
|
| 163 |
return True, []
|
| 164 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
unverified = []
|
| 166 |
for quote in quotes:
|
| 167 |
+
if len(quote.strip()) < 15:
|
|
|
|
|
|
|
| 168 |
continue
|
| 169 |
+
if not _semantic_verify(quote, contexts):
|
| 170 |
+
unverified.append(quote[:100] + "..." if len(quote) > 100 else quote)
|
| 171 |
|
| 172 |
if unverified:
|
| 173 |
return False, unverified
|