Aranwer committed on
Commit
904c6a6
·
verified ·
1 Parent(s): d26beb1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -95
app.py CHANGED
@@ -1,113 +1,66 @@
 
 
 
 
 
1
  import gradio as gr
2
- from datasets import load_dataset
3
- from sentence_transformers import SentenceTransformer
4
  import faiss
5
- import numpy as np
 
6
  from transformers import pipeline
7
 
8
- # Load dataset
9
- dataset = load_dataset("lex_glue", "scotus")
10
- corpus = [doc['text'] for doc in dataset['train'].select(range(200))] # just 200 to keep it light
 
 
 
 
 
 
 
 
 
11
 
12
- # Embedding model
13
- embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
14
- corpus_embeddings = embedder.encode(corpus, convert_to_numpy=True)
 
 
 
15
 
16
- # Build FAISS index
 
 
 
 
17
  dimension = corpus_embeddings.shape[1]
18
  index = faiss.IndexFlatL2(dimension)
19
- index.add(corpus_embeddings)
20
 
21
- # Text generation model
22
- gen_pipeline = pipeline("text2text-generation", model="facebook/bart-large-cnn")
23
 
24
- # RAG-like query function
25
- def rag_query(user_question):
26
- # Encode the user question
27
- question_embedding = embedder.encode([user_question])
28
-
29
- k = 3 # top 3 documents
30
- if index.ntotal < k:
31
- k = index.ntotal # Adjust if there are fewer documents than requested
32
-
33
- # Perform the search in the FAISS index
34
- _, indices = index.search(np.array(question_embedding), k=k)
35
 
36
- # Ensure indices are valid (within range of the corpus)
37
- valid_indices = [i for i in indices[0] if i < len(corpus)]
38
-
39
- if len(valid_indices) == 0:
40
- return "Sorry, no relevant documents were found."
41
-
42
- # Extract relevant context from the corpus based on valid indices
43
- context = " ".join([corpus[i] for i in valid_indices])
44
 
45
- # Prepare the prompt and generate the response
46
- prompt = f"Question: {user_question}\nContext: {context}\nAnswer:"
47
- result = gen_pipeline(prompt, max_length=250, do_sample=False)[0]['generated_text']
48
 
49
- return result
50
-
51
- # Gradio UI
52
- def chatbot_interface(query):
53
- return rag_query(query)
54
-
55
- # Styling for the interface
56
- css = """
57
- .gradio-container {
58
- background-color: #f0f4f8;
59
- font-family: Arial, sans-serif;
60
- }
61
- .gradio-input {
62
- background-color: #ffffff;
63
- border-radius: 5px;
64
- border: 1px solid #d1d1d1;
65
- font-size: 16px;
66
- padding: 10px;
67
- }
68
- .gradio-button {
69
- background-color: #4CAF50;
70
- color: white;
71
- border-radius: 5px;
72
- border: none;
73
- padding: 10px 20px;
74
- font-size: 16px;
75
- }
76
- .gradio-button:hover {
77
- background-color: #45a049;
78
- }
79
- .gradio-output {
80
- background-color: #ffffff;
81
- border-radius: 5px;
82
- padding: 15px;
83
- font-size: 16px;
84
- border: 1px solid #d1d1d1;
85
- }
86
- .gradio-title {
87
- font-size: 28px;
88
- font-weight: bold;
89
- color: #333333;
90
- text-align: center;
91
- margin-bottom: 20px;
92
- }
93
- .gradio-description {
94
- font-size: 16px;
95
- color: #666666;
96
- text-align: center;
97
- margin-bottom: 30px;
98
- }
99
- """
100
 
101
- # Create the Gradio interface
102
  iface = gr.Interface(
103
- fn=chatbot_interface,
104
- inputs="text",
105
- outputs="text",
106
  title="🧑‍⚖️ Legal Assistant Chatbot",
107
- description="Ask legal questions based on case data (LexGLUE - SCOTUS subset). Get answers derived from relevant court case texts.",
108
- theme="compact",
109
- css=css
110
  )
111
 
112
- # Launch the Gradio interface
113
- iface.launch()
 
1
+ import zipfile
2
+ import os
3
+ import pandas as pd
4
+ import numpy as np
5
+ import ast
6
  import gradio as gr
 
 
7
  import faiss
8
+
9
+ from sentence_transformers import SentenceTransformer
10
  from transformers import pipeline
11
 
12
# One-time setup: unpack the LexGLUE archive if it has not been extracted yet.
zip_path = "lexglue-legal-nlp-benchmark-dataset.zip"
extract_dir = "lexglue_data"

if not os.path.exists(extract_dir):
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(extract_dir)
19
+
20
+ # Load CSV from extracted folder
21
+ df = pd.read_csv(os.path.join(extract_dir, "case_hold_test.csv"))
22
+ df = df[['context', 'endings', 'label']]
23
+ df['endings'] = df['endings'].apply(ast.literal_eval)
24
 
25
+ # Prepare corpus: concatenate context with each ending
26
+ corpus = []
27
+ for idx, row in df.iterrows():
28
+ context = row['context']
29
+ for ending in row['endings']:
30
+ corpus.append(f"{context.strip()} {ending.strip()}")
31
 
32
+ # Load Sentence Transformer and encode the corpus
33
+ embedder = SentenceTransformer('all-MiniLM-L6-v2')
34
+ corpus_embeddings = embedder.encode(corpus, show_progress_bar=True)
35
+
36
+ # Create FAISS index
37
  dimension = corpus_embeddings.shape[1]
38
  index = faiss.IndexFlatL2(dimension)
39
+ index.add(np.array(corpus_embeddings))
40
 
41
+ # Load text generation pipeline
42
+ generator = pipeline("text-generation", model="gpt2")
43
 
44
# Query Function
def legal_assistant_query(query):
    """Answer a legal question via retrieval-augmented generation.

    Encodes the query, retrieves the most similar corpus passages from the
    FAISS index, and prompts the generator to answer from that context.

    Parameters
    ----------
    query : str
        The user's legal question.

    Returns
    -------
    str
        The generated answer (text after the final "Answer:" marker), or an
        apology message when no documents can be retrieved.
    """
    query_embedding = embedder.encode([query])

    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with -1, and corpus[-1] would silently return the LAST passage
    # instead of failing.
    k = min(5, index.ntotal)
    if k == 0:
        return "Sorry, no relevant documents were found."

    _, neighbour_ids = index.search(np.array(query_embedding), k=k)

    # Keep only ids that actually address the corpus (drops any -1 padding).
    retrieved_docs = [corpus[i] for i in neighbour_ids[0] if 0 <= i < len(corpus)]
    if not retrieved_docs:
        return "Sorry, no relevant documents were found."

    context_combined = "\n\n".join(retrieved_docs)

    prompt = f"Given the following legal references, answer the question:\n\n{context_combined}\n\nQuestion: {query}\nAnswer:"
    result = generator(prompt, max_new_tokens=200, do_sample=True)[0]['generated_text']

    # GPT-2 echoes the prompt; keep only the text after the last "Answer:".
    return result.split("Answer:")[-1].strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
# Wire the query function into a simple question/answer Gradio UI.
question_input = gr.Textbox(lines=2, placeholder="Ask a legal question...")
answer_output = gr.Textbox(label="Legal Response")

iface = gr.Interface(
    fn=legal_assistant_query,
    inputs=question_input,
    outputs=answer_output,
    title="🧑‍⚖️ Legal Assistant Chatbot",
    description=(
        "Ask any legal question and get context-based case references "
        "using the LexGLUE dataset."
    ),
)

iface.launch()