# fin-jack's picture
# Create app.py
# 090d043 verified
import os
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fastapi import FastAPI
# FastAPI application instance; the @app.post route in this file registers on it.
app = FastAPI()
def load_legal_data(directory):
    """
    Load legal text data from a directory.

    Only regular files located directly inside ``directory`` are read;
    sub-directories and other non-file entries are skipped. Files are read
    in sorted file-name order so the position of each document in the
    returned list is the same on every run.

    Args:
    - directory (str): Path to the directory containing legal text files.

    Returns:
    - texts (list): List of text content loaded from files in the directory.

    Raises:
    - OSError: If the directory cannot be listed or a file cannot be opened.
    - UnicodeDecodeError: If a file is not valid UTF-8.
    """
    texts = []
    # os.listdir() yields entries in arbitrary order; sorting keeps document
    # indices (and therefore similarity tie-breaks) reproducible.
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        # Opening a directory would raise IsADirectoryError, so skip
        # anything that is not a regular file.
        if not os.path.isfile(file_path):
            continue
        # Explicit encoding so decoding does not depend on the host locale.
        with open(file_path, 'r', encoding='utf-8') as file:
            texts.append(file.read())
    return texts
def preprocess_text(texts, nlp):
    """
    Normalize documents with SpaCy and fit a TF-IDF model on the result.

    Each document is run through ``nlp``; tokens flagged as stop words are
    dropped and the lemmas of the remaining tokens are joined with single
    spaces. A default-configured ``TfidfVectorizer`` is then fitted on the
    normalized documents.

    Args:
    - texts (list): List of text documents.
    - nlp (spacy.Language): SpaCy language model.

    Returns:
    - tfidf_matrix (scipy.sparse.csr_matrix): TF-IDF matrix, one row per document.
    - vectorizer (sklearn.feature_extraction.text.TfidfVectorizer): The fitted vectorizer.
    """
    normalized_docs = []
    for raw_text in texts:
        kept_lemmas = []
        for tok in nlp(raw_text):
            if tok.is_stop:
                continue
            kept_lemmas.append(tok.lemma_)
        normalized_docs.append(" ".join(kept_lemmas))

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(normalized_docs)
    return tfidf_matrix, vectorizer
def get_most_relevant_text(query_vector, tfidf_matrix, texts):
    """
    Pick the document whose TF-IDF row is closest to the query.

    Closeness is cosine similarity between ``query_vector`` and every row of
    ``tfidf_matrix``; the document at the position of the highest score is
    returned.

    Args:
    - query_vector (scipy.sparse.csr_matrix): Vectorized query
      (presumably a single row; the scores are flattened before ranking).
    - tfidf_matrix (scipy.sparse.csr_matrix): TF-IDF matrix of vectorized text.
    - texts (list): Documents, in the same order as the rows of ``tfidf_matrix``.

    Returns:
    - most_relevant_text (str): The highest-scoring document.
    """
    scores = cosine_similarity(query_vector, tfidf_matrix).flatten()
    best_position = scores.argmax()
    return texts[best_position]
# Corpus locations (hard-coded absolute paths)
cases_directory = '/kaggle/input/legalai/Object_casedocs/'
statutes_directory = '/kaggle/input/legalai/Object_statutes/'

# Read both corpora into memory when the module is imported
cases_texts = load_legal_data(cases_directory)
statutes_texts = load_legal_data(statutes_directory)

# SpaCy pipeline shared by corpus preprocessing and the request handler
nlp = spacy.load("en_core_web_sm")

# One TF-IDF matrix and fitted vectorizer per corpus
tfidf_matrix_cases, vectorizer_cases = preprocess_text(cases_texts, nlp)
tfidf_matrix_statutes, vectorizer_statutes = preprocess_text(statutes_texts, nlp)
# API Endpoint to handle legal queries
@app.post("/analyze/")
async def analyze_legal_query(query: str):
    """
    Build a plain-text legal document for a user query.

    The query is normalized the same way the corpora were before TF-IDF
    fitting (SpaCy lemmas, stop words dropped) and then matched separately
    against the case and statute collections. The best-matching case is
    re-split into one sentence per line and combined with the best-matching
    statute and a fixed guidance section that does not depend on the query.

    Args:
    - query (str): Free-text legal question from the user.

    Returns:
    - dict: ``{"query": <original query>, "legal_document": <generated text>}``.
    """
    # The vectorizers were fitted on lemmatized, stop-word-free text, so the
    # query must go through the same normalization; otherwise inflected
    # query words fail to match the lemma vocabulary.
    normalized_query = " ".join(
        token.lemma_ for token in nlp(query) if not token.is_stop
    )

    # Each corpus has its own vocabulary, hence one query vector per corpus.
    query_vector_cases = vectorizer_cases.transform([normalized_query])
    query_vector_statutes = vectorizer_statutes.transform([normalized_query])

    # Retrieve the most relevant case and statute
    relevant_case = get_most_relevant_text(query_vector_cases, tfidf_matrix_cases, cases_texts)
    relevant_statute = get_most_relevant_text(query_vector_statutes, tfidf_matrix_statutes, statutes_texts)

    # "Summary" is the full case text, one SpaCy-detected sentence per line.
    doc = nlp(relevant_case)
    case_summary = "\n".join(sent.text for sent in doc.sents)

    # Assemble the document in one pass; the original query (not the
    # normalized form) is what gets echoed back to the user.
    legal_document = "".join([
        f"Legal Document - User Query: {query}\n\n",
        f"Case Summary:\n{case_summary}\n\n",
        "Relevant Statute:\n",
        f"{relevant_statute}\n",
        "\nGuidance for the User:\n",
        "To defend your friend in court, focus on presenting evidence that supports their actions were in self-defense.\n",
        "Emphasize any mitigating circumstances and demonstrate their lack of intent to harm.\n",
        "Consult with a qualified legal professional to build a strong defense strategy.",
    ])
    return {"query": query, "legal_document": legal_document}