File size: 6,201 Bytes
f97322b
ed14dc4
6640849
 
 
 
 
59a9179
f97322b
 
 
 
8d16824
ed14dc4
f97322b
 
 
8ca6217
f97322b
 
f6d9e3b
f97322b
1bd789c
f97322b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f6d9e3b
f97322b
b832b0a
 
f97322b
 
 
 
 
1bd789c
f97322b
 
cf2f7c8
f97322b
 
 
 
 
 
 
8d16824
 
 
 
 
 
 
 
 
 
 
 
 
f97322b
f6d9e3b
f97322b
f6d9e3b
 
 
408354f
 
8d16824
 
 
 
 
f6d9e3b
408354f
8d16824
f6d9e3b
408354f
8d16824
f6d9e3b
408354f
 
f6d9e3b
 
 
408354f
f6d9e3b
 
8d16824
 
113896e
 
8d16824
 
 
 
 
 
 
113896e
408354f
f97322b
f6d9e3b
8d28ad2
f6d9e3b
8d28ad2
f97322b
 
f6d9e3b
f97322b
b832b0a
 
f97322b
 
 
 
 
dc064f8
113896e
f6d9e3b
 
113896e
 
cf2f7c8
f6d9e3b
f97322b
f6d9e3b
 
 
 
 
 
 
 
 
 
 
f97322b
 
dc064f8
113896e
f6d9e3b
f97322b
 
 
 
 
 
 
 
 
 
 
cbb87db
f97322b
1bd789c
 
cbb87db
f97322b
 
 
8b3e0c0
cbb87db
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
import os

# Redirect every HuggingFace / sentence-transformers cache to /tmp BEFORE the
# libraries are imported, so model downloads work on read-only or ephemeral
# filesystems (containers, Streamlit Cloud). setdefault keeps any value the
# deployment environment already set.
os.environ.setdefault("HF_HOME", "/tmp/huggingface")
os.environ.setdefault("TRANSFORMERS_CACHE", "/tmp/huggingface")
os.environ.setdefault("HF_HUB_CACHE", "/tmp/huggingface")
os.environ.setdefault("SENTENCE_TRANSFORMERS_HOME", "/tmp/huggingface/st_models")

import streamlit as st
import openai
from collections import deque
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone
import re
# Setup — API keys are read from environment variables, not hardcoded.
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index = pc.Index("legal-ai")  # Pinecone index holding the legal document chunks
model = SentenceTransformer('all-mpnet-base-v2')  # embedding model for retrieval
# NOTE(review): chat_history appears unused — the app reads/writes
# st.session_state.history below instead; candidate for removal.
chat_history = deque(maxlen=10)  # last 5 pairs = 10 messages
ll_model = 'gpt-4o-mini'  # chat-completion model id (name likely meant "llm_model")

st.title("AI Legal Assistant ⚖️")

# Persist the conversation across Streamlit reruns, capped at the last 10 messages.
if "history" not in st.session_state:
    st.session_state.history = deque(maxlen=10)

def get_rewritten_query(user_query):
    """Rewrite *user_query* into a self-contained, search-ready query.

    Feeds the last few turns of the session history to the chat model so
    follow-up questions ("what about section 7?") become explicit before
    the vector-DB lookup. Returns the rewritten query, or the original
    query unchanged if the API call fails.
    """
    # Only the most recent 4 messages are needed for disambiguation.
    hist = list(st.session_state.history)[-4:]
    hist_text = "\n".join(f"{m['role']}: {m['content']}" for m in hist)
    messages = [
        {"role": "system", "content":
         "You are a legal assistant that rewrites user queries into clear, context-aware queries for vector DB lookup. If it's already clear then don't rewrite"},
        {"role": "user", "content":
         f"History:\n{hist_text}\n\nNew query:\n{user_query}\n\n"
         "Rewrite if needed for clarity/search purposes. Otherwise, repeat exactly."}
    ]
    try:
        resp = client.chat.completions.create(
            model=ll_model,
            messages=messages,
            temperature=0.1,  # low temperature: near-deterministic rewrites
            max_tokens=400
        )
        rewritten = resp.choices[0].message.content.strip()
    except Exception as e:
        # Best-effort: surface the error but keep the app usable by
        # falling back to the user's original wording.
        st.error(f"Rewrite error: {e}")
        rewritten = user_query
    return rewritten

def retrieve_documents(query, top_k=10):
    """Embed *query* and fetch its top_k nearest chunks from Pinecone.

    Returns the raw list of match dicts (with metadata included), or an
    empty list when the index query fails.
    """
    vector = model.encode(query).tolist()
    try:
        result = index.query(vector=vector, top_k=top_k, include_metadata=True)
        return result['matches']
    except Exception as e:
        st.error(f"Retrieve error: {e}")
        return []


def clean_chunk_id(cid: str) -> str:
    """Convert a raw chunk id into a human-friendly label.

    Drops any trailing '_chunk...' suffix, turns underscores and dashes
    into spaces, and capitalizes each remaining word.
    """
    base = re.sub(r'_chunk.*$', '', cid)
    # Single-pass separator substitution instead of chained .replace() calls.
    spaced = base.translate(str.maketrans("_-", "  "))
    return " ".join(word.capitalize() for word in spaced.split())



def generate_response(user_query, docs):
    # --- Collect context ---
    context = "\n\n---\n\n".join(d['metadata']['text'] for d in docs)

    # --- Build human-friendly sources + mapping ---
    source_links = {}
    for d in docs:
        meta = d['metadata']
        src = meta.get("source", "unknown").lower()
        cid = meta.get("chunk_id", "")
        text_preview = " ".join(meta.get("text", "").split()[:30])

        if src in ["constitution"]:
            display_name = f"Constitution ({clean_chunk_id(cid)})"

        elif src in ["fbr_ordinance", "ordinance", "tax_ordinance"]:
            display_name = f"Tax Ordinance ({clean_chunk_id(cid)})"

        elif src in ["case_law", "case", "tax_case"]:
            display_name = f"Case Law: {text_preview}..."

        else:
            display_name = f"{src.title()} ({clean_chunk_id(cid)})"

        source_links[display_name] = meta.get("text", "")

    # Deduplicate
    source_links = dict(sorted(source_links.items()))

    # --- System prompt ---
    messages = [
        {"role": "system", "content":
         "You are a helpful legal assistant. Use the provided context from documents to answer the user's question. "
         "At the end of your answer, write a single line starting with 'Source: ' followed by the sources used. "
         "Formatting rules:\n"
         "- For Constitution / Ordinances: show the clean chunk id, no underscores/dashes, capitalized words.\n"
         "- For Case law: ignore chunk id, instead show first ~30 words of the case text.\n"
         "- Do not use technical terms like 'chunk'. Present sources in a human-friendly way.\n"
         "If multiple are used, separate them with commas."}
    ]

    messages.extend(st.session_state.history)

    messages.append({"role": "user", "content": f"Context:\n{context}\n\n"
                   f"Sources:\n{', '.join(source_links.keys())}\n\n"
                   f"Question:\n{user_query}"})
    try:
        resp = client.chat.completions.create(
            model=ll_model,
            messages=messages,
            temperature=0.1,
            max_tokens=900
        )
        reply = resp.choices[0].message.content.strip()
    except Exception as e:
        st.error(f"Response error: {e}")
        reply = "Sorry, I encountered an error generating the answer."

    # Optional: force clean source line if LLM misses it
    if source_links:
        clean_sources = ", ".join(source_links.keys())
        if "Source:" not in reply:
            reply += f"\n\nSource: {clean_sources}"

    # Save reply into history
    st.session_state.history.append({"role": "assistant", "content": reply})

    # --- Render in Streamlit ---
    st.markdown(reply)

    # Add expandable sources
    if source_links:
        st.write("### Sources")
        for name, text in source_links.items():
            with st.expander(name):
                st.write(text)

    return reply




# --- Chat UI ---
# clear_on_submit empties the text box after each send.
with st.form("chat_input", clear_on_submit=True):
    user_input = st.text_input("You:", "")
    submit = st.form_submit_button("Send")

if submit and user_input:
    # Record the user's turn, then run the rewrite -> retrieve -> answer pipeline.
    st.session_state.history.append({"role": "user", "content": user_input})
    rewritten = get_rewritten_query(user_input)
    docs = retrieve_documents(rewritten)
    generate_response(rewritten, docs)

# --- Display history, newest first ---
st.markdown("---")
for position, msg in enumerate(reversed(st.session_state.history), start=1):
    if msg["role"] == "user":
        st.markdown(f"**You:** {msg['content']}")
    else:
        st.markdown(f"**Legal Assistant:** {msg['content']}")
    # Divider after every message except the first rendered (the original
    # expressed this with the opaque bitwise test `c ^ 1`).
    if position != 1:
        st.markdown("---")