File size: 3,718 Bytes
e810b6d 3705a5d e810b6d 9a4e8af 8ea81be e810b6d ea4d61f 3705a5d 8ea81be 3705a5d 9a4e8af 2d14e21 8ea81be e810b6d 9d00612 e810b6d ea4d61f 8ea81be e810b6d ea4d61f e810b6d ea4d61f e810b6d 8ea81be 3705a5d 8ea81be e810b6d 3705a5d 8ea81be 3705a5d 8ea81be 9a4e8af 8ea81be 9a4e8af 8ea81be 9a4e8af 8ea81be 9a4e8af ea4d61f 8ea81be 3705a5d 8ea81be e810b6d ea4d61f e810b6d ea4d61f e810b6d ea4d61f e810b6d 8ea81be 3705a5d e810b6d ea4d61f e810b6d 8ea81be e810b6d 8b6e435 e810b6d 8ea81be e810b6d 8ea81be e810b6d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
import os
import json
import uuid
import gradio as gr
import chromadb
import numpy as np
from pathlib import Path
from sentence_transformers import SentenceTransformer
from huggingface_hub import CommitScheduler
from chromadb.errors import NotFoundError
from openai import OpenAI
# Load embedding model
# Query embedder used by predict(); normalize_embeddings=True is passed at
# encode time, so similarity in the Chroma index is effectively cosine.
embed_model = SentenceTransformer("BAAI/bge-small-en-v1.5")
# Load ChromaDB client
# Opens (or creates) the persistent index directory next to this script.
chroma_client = chromadb.PersistentClient(path="./clause_index")
try:
    collection = chroma_client.get_collection("legal_clauses")
except NotFoundError:
    # No prebuilt index: predict() falls back to the model's own knowledge.
    collection = None
# Setup OpenAI/Hugging Face client
# OpenAI-compatible client pointed at the Hugging Face inference router;
# HF_TOKEN must be set in the environment (e.g. a Space secret).
client = OpenAI(
    base_url="https://router.huggingface.co/featherless-ai/v1",
    api_key=os.getenv("HF_TOKEN"),
)
# Prompt template
# System instruction sent with every request; tells the model to fall back
# to general contract knowledge (and say so) when retrieval returns nothing.
system_message = """You are a legal AI assistant trained on contract clause examples from the CUAD dataset.
If no clauses are retrieved from the database, infer the answer using your understanding of common contractual standards. and report that no clause retrieved"""
# User-turn template; filled via .format(context=..., question=...) in predict().
user_template = """
### Context:
{context}
### Question:
{question}
"""
# Setup logging: one JSON-lines file per process run (unique name via uuid so
# concurrent replicas never clobber each other), pushed to a Hub dataset repo
# by a background CommitScheduler.
log_file = Path("logs/") / f"query_{uuid.uuid4()}.json"
# parents=True makes this robust if the log path is ever nested deeper;
# exist_ok=True keeps restarts idempotent (original omitted parents=True).
log_file.parent.mkdir(parents=True, exist_ok=True)
scheduler = CommitScheduler(
    repo_id="legal-rag-output",
    repo_type="dataset",
    folder_path=log_file.parent,
    path_in_repo="logs",
    every=2  # NOTE(review): CommitScheduler's `every` is in minutes — confirm intended cadence
)
# Main QA function
def predict(question):
    """Answer a contract-law question, grounding the response in retrieved clauses.

    Encodes the question, retrieves the top-3 most similar clauses from the
    ChromaDB collection (when one was loaded), formats them into the prompt
    templates, and streams a completion from the chat model. Every
    question/context/answer triple is appended to the JSON-lines log file.

    Parameters
    ----------
    question : str
        The user's legal question (from the Gradio textbox).

    Returns
    -------
    str
        The generated answer, or an error message if generation failed.
    """
    # Default fallback context and output, assigned BEFORE the try block so the
    # logging code below can always reference them. (Previously `context` was
    # first assigned inside the try, so an early failure — e.g. in encoding —
    # made the log write crash with NameError instead of recording the error.)
    context = "No relevant clauses were found in the database. Please answer using your legal understanding from the CUAD dataset."
    output = ""
    try:
        # Encode query with normalized embeddings (cosine-style similarity)
        query_embedding = embed_model.encode([question], normalize_embeddings=True)[0]
        # If collection exists, try retrieval
        if collection:
            try:
                results = collection.query(
                    query_embeddings=[query_embedding.tolist()],
                    n_results=3
                )
                # Chroma returns one result list per query embedding; we sent one.
                documents = results["documents"][0]
                metadatas = results["metadatas"][0]
                if documents:
                    context = "\n\n".join(
                        f"[Clause Type: {m['clause_type']}] {doc}"
                        for doc, m in zip(documents, metadatas)
                    )
            except Exception:
                # Best-effort retrieval: on any failure, fall back to model knowledge
                context = "Due to an internal retrieval issue, please answer based on your legal knowledge from CUAD dataset."
        # Construct prompt from the module-level templates
        prompt = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_template.format(context=context, question=question)}
        ]
        # Generate response (streamed; chunks concatenated into one string)
        stream = client.chat.completions.create(
            model="mistralai/Mistral-7B-Instruct-v0.2",
            messages=prompt,
            temperature=0.4,
            top_p=0.7,
            stream=True
        )
        for chunk in stream:
            # Delta content can be None on keep-alive/final chunks
            output += chunk.choices[0].delta.content or ""
    except Exception as e:
        output = f"An internal error occurred while generating the response: {str(e)}"
    # Log to file; the scheduler lock serializes writes against background commits
    with scheduler.lock:
        with log_file.open("a") as f:
            f.write(json.dumps({
                "question": question,
                "context": context,
                "response": output
            }) + "\n")
    return output
# Gradio UI
# Single text-in / text-out interface wired to predict().
demo = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(label="Enter your legal question:", lines=4),
    outputs=gr.Textbox(label="Answer"),
    title="⚖️ GL_LegalMind",
    description="Ask contract-related legal questions. Answers are based on retrieved clauses or inferred from CUAD knowledge."
)
# queue() enables request queuing so concurrent users are handled serially.
demo.queue()
demo.launch()
|