ardhigagan committed on
Commit
01042a2
·
verified ·
1 Parent(s): e11524d

Upload 4 files

Browse files
Files changed (3) hide show
  1. src/analysis.py +80 -0
  2. src/ingestion.py +37 -0
  3. src/processing.py +15 -0
src/analysis.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
2
+ import torch
3
+
4
+ # Check if GPU is available
5
+ device = 0 if torch.cuda.is_available() else -1
6
+ print(f"utilizing device: {'GPU' if device == 0 else 'CPU'}")
7
+
8
+ # 1. LOAD MODELS
9
+ print("Loading Summarization Model...")
10
+ # Force PyTorch framework with framework="pt"
11
+ summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device=device, framework="pt")
12
+
13
+ print("Loading Risk Detection Model...")
14
+ risk_detector = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device, framework="pt")
15
+
16
+ def analyze_chunk(text_chunk):
17
+ """
18
+ Analyzes a single chunk. Returns a summary and A LIST of risks.
19
+ """
20
+ # A. SUMMARIZE
21
+ try:
22
+ summary_result = summarizer(text_chunk, max_length=150, min_length=30, do_sample=False)
23
+ summary = summary_result[0]['summary_text']
24
+ except Exception as e:
25
+ print(f"Summarization error: {e}")
26
+ summary = ""
27
+
28
+ # B. DETECT RISKS (MULTI-LABEL)
29
+ # The AI will now check for these 10 distinct legal traps + "Safe"
30
+ candidate_labels = [
31
+ "Financial Penalty",
32
+ "Privacy Violation",
33
+ "Non-Compete Restriction",
34
+ "Termination Without Cause",
35
+ "Intellectual Property Transfer",
36
+ "Mandatory Arbitration",
37
+ "Indemnification Obligation",
38
+ "Unilateral Amendment",
39
+ "Jurisdiction Waiver",
40
+ "Automatic Renewal",
41
+ "Safe Standard Clause"
42
+ ]
43
+
44
+ # multi_label=True allows multiple independent high scores
45
+ risk_result = risk_detector(text_chunk, candidate_labels, multi_label=True)
46
+
47
+ # Collect ALL risks above the threshold (50%)
48
+ detected_risks = []
49
+
50
+ for label, score in zip(risk_result['labels'], risk_result['scores']):
51
+ # If it's a risk label AND confidence is > 50%
52
+ if label != "Safe Standard Clause" and score > 0.50:
53
+ detected_risks.append({
54
+ "type": label,
55
+ "score": round(score, 2),
56
+ "text_snippet": text_chunk[:200] + "..." # Snippet for context
57
+ })
58
+
59
+ return summary, detected_risks
60
+
61
+ def analyze_document(chunks):
62
+ """
63
+ Orchestrates the analysis.
64
+ """
65
+ full_summary = []
66
+ all_risks = []
67
+
68
+ print(f"Starting analysis on {len(chunks)} chunks...")
69
+
70
+ for i, chunk in enumerate(chunks):
71
+ summary, risks = analyze_chunk(chunk)
72
+ full_summary.append(summary)
73
+
74
+ # Add all found risks to the master list
75
+ if risks:
76
+ all_risks.extend(risks)
77
+
78
+ final_executive_summary = " ".join(full_summary)
79
+
80
+ return final_executive_summary, all_risks
src/ingestion.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdfplumber
2
+ import pytesseract
3
+ from PIL import Image
4
+ import numpy as np
5
+ import cv2
6
+ from pdf2image import convert_from_bytes
7
+ import io
8
+
9
+ def clean_text(text):
10
+ if not text:
11
+ return ""
12
+ text = "\n".join([line.strip() for line in text.split("\n") if line.strip()])
13
+ return text
14
+
15
+ def extract_text_from_pdf(file_bytes):
16
+ text_content = ""
17
+ with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
18
+ for page in pdf.pages:
19
+ extracted = page.extract_text()
20
+ if extracted:
21
+ text_content += extracted + "\n"
22
+ if len(text_content) < 50:
23
+ print("Digital extraction failed. Switching to OCR...")
24
+ text_content = ""
25
+ images = convert_from_bytes(file_bytes)
26
+
27
+ for img in images:
28
+ img_np = np.array(img)
29
+ gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)
30
+ _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)
31
+ page_text = pytesseract.image_to_string(thresh)
32
+ text_content += page_text + "\n"
33
+ return clean_text(text_content)
34
+
35
+ def extract_text_from_image(file_bytes):
36
+ image = Image.open(io.BytesIO(file_bytes))
37
+ return pytesseract.image_to_string(image)
src/processing.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
2
+
3
+ def chunk_text(text, chunk_size=1000, chunk_overlap=200):
4
+ if not text:
5
+ return []
6
+
7
+ text_splitter = RecursiveCharacterTextSplitter(
8
+ chunk_size=chunk_size,
9
+ chunk_overlap=chunk_overlap,
10
+ separators=["\n\n", "\n", ".", " ", ""]
11
+ )
12
+
13
+ chunks = text_splitter.split_text(text)
14
+ print(f"Split document into {len(chunks)} chunks.")
15
+ return chunks