Aya-Samir-Emam committed on
Commit
0016e0e
·
verified ·
1 Parent(s): 0aaa5da

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -84
app.py CHANGED
@@ -3,129 +3,115 @@ import torch
3
  from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
4
  import PyPDF2
5
  from docx import Document
6
- from fpdf import FPDF
7
- import base64
8
 
9
# --- 1. Page Configuration ---
st.set_page_config(
    page_title="LexGuard Ultimate | AI Legal Auditor",
    page_icon="⚖️",
    layout="wide",
)


# --- 2. Load Engines (Legal & Sentiment) ---
@st.cache_resource
def load_engines():
    """Build the tokenizer, the clause-extraction QA model and the sentiment pipeline.

    Decorated with ``st.cache_resource`` so Streamlit keeps the loaded
    objects instead of rebuilding them each time the script runs.

    Returns:
        A ``(tokenizer, model, risk_engine)`` tuple: the tokenizer and the
        question-answering model for the CUAD checkpoint, plus a
        sentiment-analysis pipeline used by the risk check.
    """
    cuad_checkpoint = "marshmellow77/roberta-base-cuad"
    qa_tokenizer = AutoTokenizer.from_pretrained(cuad_checkpoint)
    qa_model = AutoModelForQuestionAnswering.from_pretrained(cuad_checkpoint)
    sentiment_pipeline = pipeline(
        "sentiment-analysis",
        model="distilbert-base-uncased-finetuned-sst-2-english",
    )
    return qa_tokenizer, qa_model, sentiment_pipeline


# Module-level handles used by the extraction and risk-analysis code.
tokenizer, model, risk_engine = load_engines()
21
 
22
# --- 3. Advanced Extraction & Analysis Logic ---
def get_legal_clause(query, context):
    """Extract the contract span that best answers ``query``.

    The contract is split into overlapping 512-token windows (256-token
    stride) and each window is scored by the question-answering model.
    Only tokens that belong to the contract text are eligible as answer
    boundaries, and the end is always chosen at or after the start.

    Args:
        query: Natural-language question describing the clause to find.
        context: Full contract text.

    Returns:
        The extracted clause text, or ``None`` when ``context`` is empty or
        the best span is 15 characters or shorter.
    """
    # Nothing to search: skip tokenisation and inference entirely.
    if not context or not context.strip():
        return None

    inputs = tokenizer(
        query, context, truncation="only_second", max_length=512, stride=256,
        return_overflowing_tokens=True, padding="max_length", return_tensors="pt",
    )

    best_ans, max_score = "", -float('inf')
    neg_inf = float('-inf')

    for i in range(len(inputs["input_ids"])):
        chunk = {"input_ids": inputs["input_ids"][i:i+1], "attention_mask": inputs["attention_mask"][i:i+1]}
        with torch.no_grad():
            outputs = model(**chunk)

        # sequence_ids() is 1 for contract tokens, 0 for the question and
        # None for special/padding tokens. Restricting to 1 stops a window
        # whose strongest logits sit on <s> (i.e. "no answer here") from
        # outranking a window that actually contains the clause.
        context_mask = torch.tensor(
            [seq_id == 1 for seq_id in inputs.sequence_ids(i)], dtype=torch.bool
        )
        if not context_mask.any():
            continue

        start_logits = outputs.start_logits[0].masked_fill(~context_mask, neg_inf)
        start = int(torch.argmax(start_logits))

        # The end token may not precede the start token.
        end_mask = context_mask.clone()
        end_mask[:start] = False
        end_logits = outputs.end_logits[0].masked_fill(~end_mask, neg_inf)
        end = int(torch.argmax(end_logits))

        score = (start_logits[start] + end_logits[end]).item()
        if score > max_score:
            max_score = score
            best_ans = tokenizer.convert_tokens_to_string(
                tokenizer.convert_ids_to_tokens(inputs["input_ids"][i][start:end + 1])
            )

    # Defensive: the masking above should already exclude special tokens.
    clean_ans = best_ans.replace("<s>", "").replace("</s>", "").strip()
    return clean_ans if len(clean_ans) > 15 else None
39
 
40
# --- 4. PDF Report Generator ---
def create_pdf(results):
    """Render the audit results as a PDF report and return it as bytes.

    Args:
        results: Mapping of clause title to a dict with ``"status"`` and
            ``"content"`` entries.

    Returns:
        The PDF document as ``bytes``.
    """
    def latin1_safe(text):
        # The report is written with the built-in "Arial" font and encoded
        # as Latin-1. Status labels carry emoji and clause text may carry
        # other non-Latin-1 characters, so anything unencodable is replaced
        # with "?" instead of raising UnicodeEncodeError.
        return str(text).encode("latin-1", "replace").decode("latin-1")

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", 'B', 16)
    pdf.cell(200, 10, txt="LexGuard AI - Contract Audit Report", ln=True, align='C')
    pdf.set_font("Arial", size=10)
    pdf.cell(200, 10, txt="Generated by Aya Samir | AI Legal Platform", ln=True, align='C')
    pdf.ln(10)

    for title, data in results.items():
        safe_title = latin1_safe(title)
        safe_status = latin1_safe(data['status'])
        safe_content = latin1_safe(data['content'])

        pdf.set_font("Arial", 'B', 12)
        pdf.cell(200, 10, txt=f"Clause: {safe_title}", ln=True)
        pdf.set_font("Arial", size=10)
        pdf.multi_cell(0, 5, txt=f"Status: {safe_status}\nContent: {safe_content}\n", border=1)
        pdf.ln(5)

    rendered = pdf.output(dest='S')
    # Accept either return type: a str is encoded as before, a
    # bytes/bytearray result (NOTE(review): as in fpdf2 — confirm which
    # package is installed) is converted directly.
    if isinstance(rendered, str):
        return rendered.encode('latin-1')
    return bytes(rendered)
57
 
58
# --- 5. Sidebar & File Handling ---
with st.sidebar:
    st.image("https://img.icons8.com/fluency/96/law.png", width=70)
    st.title("LexGuard Control")
    uploaded_file = st.file_uploader("Upload PDF or DOCX Contract", type=["pdf", "docx"])
    st.markdown("---")
    st.caption("Standard: UK/International Law Compliance")

# --- 6. Main Dashboard UI ---
st.title("⚖️ LexGuard Ultimate: Deep Contractual Audit")
st.markdown("Automated Gap Analysis & Risk Assessment Engine")

if uploaded_file:
    # Extract plain text from the upload.
    if uploaded_file.type == "application/pdf":
        # Pages are joined with a newline so the last word of one page is
        # not fused with the first word of the next. `or ""` is a defensive
        # guard in case a page yields no text object.
        raw_text = "\n".join(
            (page.extract_text() or "") for page in PyPDF2.PdfReader(uploaded_file).pages
        )
    else:
        raw_text = "\n".join([p.text for p in Document(uploaded_file).paragraphs])

    if not raw_text.strip():
        # Without text every clause would be reported as a critical
        # omission, which would be misleading.
        st.error(
            "No text could be extracted from this file. It may be a scanned "
            "(image-only) document that needs OCR before it can be audited."
        )
    elif st.button("🔍 Run Full Legal Audit"):
        audit_results = {}
        # Clause title -> question put to the extraction model.
        audit_plan = {
            "Governing Law": "What is the governing law and jurisdiction?",
            "Termination Rights": "What are the notice periods and conditions for termination?",
            "Limitation of Liability": "What is the maximum liability cap or exclusion of damages?",
            "Indemnification": "What are the indemnification obligations and losses?",
            "Confidentiality": "What are the non-disclosure obligations and their duration?",
            "Force Majeure": "What events excuse performance under the contract?",
            "Intellectual Property": "Who owns the intellectual property and work products?"
        }
        # Substrings that flag a clause as risky regardless of sentiment.
        risk_keywords = ("limit", "exclude", "waive")

        st.subheader("🚩 Audit Dashboard")
        progress_bar = st.progress(0)

        for idx, (title, query) in enumerate(audit_plan.items()):
            clause = get_legal_clause(query, raw_text)

            with st.expander(f"📌 {title}", expanded=True):
                if clause:
                    sentiment = risk_engine(clause[:512])[0]
                    # Risk Logic (Negative Sentiment or Specific Keywords)
                    lowered = clause.lower()
                    is_risky = sentiment['label'] == 'NEGATIVE' or any(word in lowered for word in risk_keywords)

                    if is_risky:
                        status = "🔴 HIGH RISK / AGGRESSIVE"
                        st.error(status)
                    else:
                        status = "🟢 LOW RISK / STANDARD"
                        st.success(status)

                    st.write("**Extracted Text:**")
                    st.code(clause, language="text")
                else:
                    status = "⚠️ CRITICAL OMISSION / NOT FOUND"
                    st.warning(status)
                    st.info(f"Legal Insight: Missing {title} clauses increase litigation risks under English Law.")

            audit_results[title] = {"status": status, "content": clause if clause else "OMITTED"}

            progress_bar.progress((idx + 1) / len(audit_plan))

        # PDF Download Option: the report is embedded as a base64 data URI.
        pdf_data = create_pdf(audit_results)
        b64 = base64.b64encode(pdf_data).decode()
        href = f'<a href="data:application/octet-stream;base64,{b64}" download="LexGuard_Audit_Report.pdf" style="text-decoration:none;"><button style="width:100%; border-radius:5px; background-color:#4F46E5; color:white; padding:10px; border:none; cursor:pointer;">📥 Download Full Audit Report (PDF)</button></a>'
        st.markdown(href, unsafe_allow_html=True)

else:
    st.info("👋 Welcome! Please upload a contract in the sidebar to start the deep analysis.")

# --- 7. Footer ---
st.markdown("---")
st.caption("Developed by Aya Samir | AI, Data & Legal Consultant | Master's Researcher in Law")
 
3
  from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
4
  import PyPDF2
5
  from docx import Document
 
 
6
 
7
# --- 1. Page settings ---
st.set_page_config(
    page_title="LexGuard Precision",
    page_icon="⚖️",
    layout="wide",
)


# --- 2. Load the models ---
@st.cache_resource
def load_engines():
    """Build the tokenizer, the clause-extraction QA model and the risk analyzer.

    Decorated with ``st.cache_resource`` so Streamlit keeps the loaded
    objects instead of rebuilding them each time the script runs.

    Returns:
        A ``(tokenizer, model, risk_analyzer)`` tuple: the tokenizer and the
        question-answering model for the CUAD checkpoint, plus a
        sentiment-analysis pipeline used by the risk check.
    """
    # Legal clause extraction model (CUAD).
    cuad_checkpoint = "marshmellow77/roberta-base-cuad"
    qa_tokenizer = AutoTokenizer.from_pretrained(cuad_checkpoint)
    qa_model = AutoModelForQuestionAnswering.from_pretrained(cuad_checkpoint)

    # Risk analysis model.
    sentiment_pipeline = pipeline(
        "sentiment-analysis",
        model="distilbert-base-uncased-finetuned-sst-2-english",
    )
    return qa_tokenizer, qa_model, sentiment_pipeline


# Module-level handles used by the extraction and risk-analysis code.
tokenizer, model, risk_analyzer = load_engines()
21
 
22
# --- 3. Clause extraction logic ---
def get_precise_clause(query, context):
    """Extract the contract span that best answers ``query``.

    The contract is split into overlapping 512-token windows (256-token
    stride) and each window is scored by the question-answering model.
    Only tokens that belong to the contract text are eligible as answer
    boundaries, and the end is always chosen at or after the start. A
    window's span is kept only if its score exceeds a fixed threshold.

    Args:
        query: Natural-language question describing the clause to find.
        context: Full contract text.

    Returns:
        The extracted clause text, or ``None`` when ``context`` is empty,
        no window clears the threshold, or the best span is 25 characters
        or shorter.
    """
    # Nothing to search: skip tokenisation and inference entirely.
    if not context or not context.strip():
        return None

    # Consecutive windows overlap by `stride` tokens so a clause that
    # straddles a window boundary is still seen whole in one of them.
    inputs = tokenizer(
        query, context, truncation="only_second", max_length=512, stride=256,
        return_overflowing_tokens=True, padding="max_length", return_tensors="pt",
    )

    best_answer = ""
    max_score = -float('inf')
    threshold = 2.5  # minimum start+end logit sum for a span to be accepted
    neg_inf = float('-inf')

    for i in range(len(inputs["input_ids"])):
        chunk = {"input_ids": inputs["input_ids"][i:i+1], "attention_mask": inputs["attention_mask"][i:i+1]}
        with torch.no_grad():
            outputs = model(**chunk)

        # sequence_ids() is 1 for contract tokens, 0 for the question and
        # None for special/padding tokens. Restricting to 1 stops a window
        # whose strongest logits sit on <s> (i.e. "no answer here") from
        # outranking a window that actually contains the clause.
        context_mask = torch.tensor(
            [seq_id == 1 for seq_id in inputs.sequence_ids(i)], dtype=torch.bool
        )
        if not context_mask.any():
            continue

        start_logits = outputs.start_logits[0].masked_fill(~context_mask, neg_inf)
        start_idx = int(torch.argmax(start_logits))

        # The end token may not precede the start token.
        end_mask = context_mask.clone()
        end_mask[:start_idx] = False
        end_logits = outputs.end_logits[0].masked_fill(~end_mask, neg_inf)
        end_idx = int(torch.argmax(end_logits))

        # Answer strength for this window.
        current_score = (start_logits[start_idx] + end_logits[end_idx]).item()

        if current_score > threshold and current_score > max_score:
            max_score = current_score
            best_answer = tokenizer.convert_tokens_to_string(
                tokenizer.convert_ids_to_tokens(inputs["input_ids"][i][start_idx:end_idx + 1])
            )

    # Defensive: the masking above should already exclude special tokens.
    clean_ans = best_answer.replace("<s>", "").replace("</s>", "").strip()
    # Reject spans too short to be a meaningful clause.
    return clean_ans if len(clean_ans) > 25 else None
 
 
 
54
 
55
# --- 4. User interface ---
st.title("⚖️ LexGuard Pro: Comprehensive Legal Audit")
st.markdown("### نظام التدقيق القانوني الآلي وتحليل الثغرات")

with st.sidebar:
    st.header("📥 إدخال العقد")
    uploaded_file = st.file_uploader("ارفع ملف العقد (PDF/DOCX)", type=["pdf", "docx"])
    st.info("النظام مهيأ لتحليل العقود وفق معايير القانون الإنجليزي والدولي.")

if uploaded_file:
    # Read the text from the uploaded file.
    if uploaded_file.type == "application/pdf":
        # Pages are joined with a newline so the last word of one page is
        # not fused with the first word of the next. `or ""` is a defensive
        # guard in case a page yields no text object.
        raw_text = "\n".join(
            (page.extract_text() or "") for page in PyPDF2.PdfReader(uploaded_file).pages
        )
    else:
        raw_text = "\n".join([p.text for p in Document(uploaded_file).paragraphs])

    if not raw_text.strip():
        # Without text every clause would be reported as missing, which
        # would be misleading.
        st.error("تعذر استخراج أي نص من الملف. قد يكون الملف ممسوحاً ضوئياً (صور فقط) ويحتاج إلى معالجة OCR قبل التدقيق.")
    elif st.button("🚀 ابدأ التدقيق الشامل الآن"):
        st.divider()

        # Audit plan: clause title -> question put to the extraction model.
        audit_plan = {
            "Governing Law": "Which state or country's law governs this agreement and where is the jurisdiction?",
            "Termination for Convenience": "What is the notice period for termination without cause?",
            "Termination for Cause": "Under what conditions can the contract be terminated for breach?",
            "Limitation of Liability": "What is the monetary cap or limit on the provider's liability?",
            "Confidentiality Obligations": "What are the restrictions on disclosing trade secrets and for how long?",
            "Indemnification": "Which party is responsible for defending third-party legal claims?",
            "Force Majeure": "What unexpected events excuse a party from performing their duties?",
            "Intellectual Property": "Who owns the copyright and ownership of the software and work product?"
        }
        # Substrings that flag a clause as risky regardless of sentiment.
        risk_keywords = ("limit", "exclude", "sole discretion", "waive", "immediate")

        progress_bar = st.progress(0)

        for idx, (title, query) in enumerate(audit_plan.items()):
            clause = get_precise_clause(query, raw_text)

            with st.expander(f"📌 بند: {title}", expanded=True):
                if clause:
                    # Risk analysis (sentiment + keyword check).
                    sentiment = risk_analyzer(clause[:512])[0]
                    lowered = clause.lower()
                    is_risky = sentiment['label'] == 'NEGATIVE' or any(word in lowered for word in risk_keywords)

                    if is_risky:
                        st.error("🔴 تنبيه مخاطر: صياغة هذا البند قد تكون تقييدية أو مجحفة.")
                    else:
                        st.success("🟢 بند قياسي: النص يتبع القواعد العامة المتعارف عليها.")

                    st.write("**النص المستخرج من العقد:**")
                    st.code(clause, language="text")
                else:
                    # Gap analysis: no usable clause was extracted.
                    st.warning(f"⚠️ بند مفقود: لم يتم العثور على نص صريح يتعلق بـ ({title}).")
                    st.info("نصيحة قانونية: غياب هذا البند في العقود الدولية قد يؤدي إلى نزاعات قضائية معقدة.")

            progress_bar.progress((idx + 1) / len(audit_plan))

        st.balloons()

else:
    st.info("يرجى رفع ملف العقد من القائمة الجانبية لبدء عملية الفحص الآلي.")

st.markdown("---")
st.caption("Aya Samir | AI & Legal Consultant | Master's Researcher in Law")