ardhigagan committed on
Commit
01042a2
·
verified ·
1 Parent(s): e11524d

Upload 4 files

Browse files
Files changed (3) hide show
  1. src/analysis.py +80 -0
  2. src/ingestion.py +37 -0
  3. src/processing.py +15 -0
src/analysis.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
2
+ import torch
3
+
4
+ # Check if GPU is available
5
+ device = 0 if torch.cuda.is_available() else -1
6
+ print(f"utilizing device: {'GPU' if device == 0 else 'CPU'}")
7
+
8
+ # 1. LOAD MODELS
9
+ print("Loading Summarization Model...")
10
+ # Force PyTorch framework with framework="pt"
11
+ summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device=device, framework="pt")
12
+
13
+ print("Loading Risk Detection Model...")
14
+ risk_detector = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device, framework="pt")
15
+
16
+ def analyze_chunk(text_chunk):
17
+ """
18
+ Analyzes a single chunk. Returns a summary and A LIST of risks.
19
+ """
20
+ # A. SUMMARIZE
21
+ try:
22
+ summary_result = summarizer(text_chunk, max_length=150, min_length=30, do_sample=False)
23
+ summary = summary_result[0]['summary_text']
24
+ except Exception as e:
25
+ print(f"Summarization error: {e}")
26
+ summary = ""
27
+
28
+ # B. DETECT RISKS (MULTI-LABEL)
29
+ # The AI will now check for these 10 distinct legal traps + "Safe"
30
+ candidate_labels = [
31
+ "Financial Penalty",
32
+ "Privacy Violation",
33
+ "Non-Compete Restriction",
34
+ "Termination Without Cause",
35
+ "Intellectual Property Transfer",
36
+ "Mandatory Arbitration",
37
+ "Indemnification Obligation",
38
+ "Unilateral Amendment",
39
+ "Jurisdiction Waiver",
40
+ "Automatic Renewal",
41
+ "Safe Standard Clause"
42
+ ]
43
+
44
+ # multi_label=True allows multiple independent high scores
45
+ risk_result = risk_detector(text_chunk, candidate_labels, multi_label=True)
46
+
47
+ # Collect ALL risks above the threshold (50%)
48
+ detected_risks = []
49
+
50
+ for label, score in zip(risk_result['labels'], risk_result['scores']):
51
+ # If it's a risk label AND confidence is > 50%
52
+ if label != "Safe Standard Clause" and score > 0.50:
53
+ detected_risks.append({
54
+ "type": label,
55
+ "score": round(score, 2),
56
+ "text_snippet": text_chunk[:200] + "..." # Snippet for context
57
+ })
58
+
59
+ return summary, detected_risks
60
+
61
+ def analyze_document(chunks):
62
+ """
63
+ Orchestrates the analysis.
64
+ """
65
+ full_summary = []
66
+ all_risks = []
67
+
68
+ print(f"Starting analysis on {len(chunks)} chunks...")
69
+
70
+ for i, chunk in enumerate(chunks):
71
+ summary, risks = analyze_chunk(chunk)
72
+ full_summary.append(summary)
73
+
74
+ # Add all found risks to the master list
75
+ if risks:
76
+ all_risks.extend(risks)
77
+
78
+ final_executive_summary = " ".join(full_summary)
79
+
80
+ return final_executive_summary, all_risks
src/ingestion.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdfplumber
2
+ import pytesseract
3
+ from PIL import Image
4
+ import numpy as np
5
+ import cv2
6
+ from pdf2image import convert_from_bytes
7
+ import io
8
+
9
+ def clean_text(text):
10
+ if not text:
11
+ return ""
12
+ text = "\n".join([line.strip() for line in text.split("\n") if line.strip()])
13
+ return text
14
+
15
+ def extract_text_from_pdf(file_bytes):
16
+ text_content = ""
17
+ with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
18
+ for page in pdf.pages:
19
+ extracted = page.extract_text()
20
+ if extracted:
21
+ text_content += extracted + "\n"
22
+ if len(text_content) < 50:
23
+ print("Digital extraction failed. Switching to OCR...")
24
+ text_content = ""
25
+ images = convert_from_bytes(file_bytes)
26
+
27
+ for img in images:
28
+ img_np = np.array(img)
29
+ gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)
30
+ _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)
31
+ page_text = pytesseract.image_to_string(thresh)
32
+ text_content += page_text + "\n"
33
+ return clean_text(text_content)
34
+
35
+ def extract_text_from_image(file_bytes):
36
+ image = Image.open(io.BytesIO(file_bytes))
37
+ return pytesseract.image_to_string(image)
src/processing.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
2
+
3
+ def chunk_text(text, chunk_size=1000, chunk_overlap=200):
4
+ if not text:
5
+ return []
6
+
7
+ text_splitter = RecursiveCharacterTextSplitter(
8
+ chunk_size=chunk_size,
9
+ chunk_overlap=chunk_overlap,
10
+ separators=["\n\n", "\n", ".", " ", ""]
11
+ )
12
+
13
+ chunks = text_splitter.split_text(text)
14
+ print(f"Split document into {len(chunks)} chunks.")
15
+ return chunks