Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -3,129 +3,115 @@ import torch
|
|
| 3 |
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
|
| 4 |
import PyPDF2
|
| 5 |
from docx import Document
|
| 6 |
-
from fpdf import FPDF
|
| 7 |
-
import base64
|
| 8 |
|
| 9 |
-
# --- 1.
|
| 10 |
-
st.set_page_config(page_title="LexGuard
|
| 11 |
|
| 12 |
-
# --- 2.
|
| 13 |
@st.cache_resource
|
| 14 |
def load_engines():
|
|
|
|
| 15 |
tokenizer = AutoTokenizer.from_pretrained("marshmellow77/roberta-base-cuad")
|
| 16 |
model = AutoModelForQuestionAnswering.from_pretrained("marshmellow77/roberta-base-cuad")
|
| 17 |
-
|
| 18 |
-
|
|
|
|
| 19 |
|
| 20 |
-
tokenizer, model,
|
| 21 |
|
| 22 |
-
# --- 3.
|
| 23 |
-
def
|
|
|
|
| 24 |
inputs = tokenizer(query, context, truncation="only_second", max_length=512, stride=256,
|
| 25 |
-
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
for i in range(len(inputs["input_ids"])):
|
| 28 |
chunk = {"input_ids": inputs["input_ids"][i:i+1], "attention_mask": inputs["attention_mask"][i:i+1]}
|
| 29 |
with torch.no_grad():
|
| 30 |
outputs = model(**chunk)
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
clean_ans = best_ans.replace("<s>", "").replace("</s>", "").strip()
|
| 38 |
-
return clean_ans if len(clean_ans) > 15 else None
|
| 39 |
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
pdf.cell(200, 10, txt="Generated by Aya Samir | AI Legal Platform", ln=True, align='C')
|
| 48 |
-
pdf.ln(10)
|
| 49 |
-
|
| 50 |
-
for title, data in results.items():
|
| 51 |
-
pdf.set_font("Arial", 'B', 12)
|
| 52 |
-
pdf.cell(200, 10, txt=f"Clause: {title}", ln=True)
|
| 53 |
-
pdf.set_font("Arial", size=10)
|
| 54 |
-
pdf.multi_cell(0, 5, txt=f"Status: {data['status']}\nContent: {data['content']}\n", border=1)
|
| 55 |
-
pdf.ln(5)
|
| 56 |
-
return pdf.output(dest='S').encode('latin-1')
|
| 57 |
|
| 58 |
-
#
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
uploaded_file = st.file_uploader("Upload PDF or DOCX Contract", type=["pdf", "docx"])
|
| 63 |
-
st.markdown("---")
|
| 64 |
-
st.caption("Standard: UK/International Law Compliance")
|
| 65 |
|
| 66 |
-
# ---
|
| 67 |
-
st.title("⚖️ LexGuard
|
| 68 |
-
st.markdown("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
if uploaded_file:
|
| 71 |
-
#
|
| 72 |
if uploaded_file.type == "application/pdf":
|
| 73 |
raw_text = "".join([p.extract_text() for p in PyPDF2.PdfReader(uploaded_file).pages])
|
| 74 |
else:
|
| 75 |
raw_text = "\n".join([p.text for p in Document(uploaded_file).paragraphs])
|
| 76 |
|
| 77 |
-
if st.button("
|
| 78 |
-
|
|
|
|
|
|
|
| 79 |
audit_plan = {
|
| 80 |
-
"Governing Law": "
|
| 81 |
-
"Termination
|
| 82 |
-
"
|
| 83 |
-
"
|
| 84 |
-
"Confidentiality": "What are the
|
| 85 |
-
"
|
| 86 |
-
"
|
|
|
|
| 87 |
}
|
| 88 |
|
| 89 |
-
st.subheader("🚩 Audit Dashboard")
|
| 90 |
progress_bar = st.progress(0)
|
| 91 |
|
| 92 |
for idx, (title, query) in enumerate(audit_plan.items()):
|
| 93 |
-
clause =
|
| 94 |
-
status = "Standard/Detected"
|
| 95 |
|
| 96 |
-
with st.expander(f"📌 {title}", expanded=True):
|
| 97 |
if clause:
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
|
|
|
| 101 |
|
| 102 |
if is_risky:
|
| 103 |
-
|
| 104 |
-
st.error(status)
|
| 105 |
else:
|
| 106 |
-
|
| 107 |
-
st.success(status)
|
| 108 |
|
| 109 |
-
st.write("**
|
| 110 |
st.code(clause, language="text")
|
| 111 |
else:
|
| 112 |
-
|
| 113 |
-
st.warning(
|
| 114 |
-
st.info(f"
|
| 115 |
-
|
| 116 |
-
audit_results[title] = {"status": status, "content": clause if clause else "OMITTED"}
|
| 117 |
|
| 118 |
progress_bar.progress((idx + 1) / len(audit_plan))
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
pdf_data = create_pdf(audit_results)
|
| 122 |
-
b64 = base64.b64encode(pdf_data).decode()
|
| 123 |
-
href = f'<a href="data:application/octet-stream;base64,{b64}" download="LexGuard_Audit_Report.pdf" style="text-decoration:none;"><button style="width:100%; border-radius:5px; background-color:#4F46E5; color:white; padding:10px; border:none; cursor:pointer;">📥 Download Full Audit Report (PDF)</button></a>'
|
| 124 |
-
st.markdown(href, unsafe_allow_html=True)
|
| 125 |
-
|
| 126 |
else:
|
| 127 |
-
st.info("
|
| 128 |
|
| 129 |
-
# --- 7. Footer ---
|
| 130 |
st.markdown("---")
|
| 131 |
-
st.caption("
|
|
|
|
| 3 |
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
|
| 4 |
import PyPDF2
|
| 5 |
from docx import Document
|
|
|
|
|
|
|
| 6 |
|
| 7 |
+
# --- 1. إعدادات الصفحة ---
|
| 8 |
+
st.set_page_config(page_title="LexGuard Precision", page_icon="⚖️", layout="wide")
|
| 9 |
|
| 10 |
+
# --- 2. تحميل المحركات الذكية ---
|
| 11 |
@st.cache_resource
def load_engines():
    """Load the tokenizer, QA model and sentiment pipeline used by the audit.

    Decorated with ``st.cache_resource`` so the loaded objects can be reused
    instead of being rebuilt on every call.

    Returns:
        A ``(tokenizer, model, risk_analyzer)`` tuple: the tokenizer and the
        question-answering model both come from the CUAD legal-clause
        checkpoint, and ``risk_analyzer`` is a sentiment-analysis pipeline
        used for risk scoring.
    """
    # Legal clause extraction checkpoint (CUAD).
    cuad_checkpoint = "marshmellow77/roberta-base-cuad"
    # Checkpoint backing the risk (sentiment) analysis.
    sentiment_checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

    return (
        AutoTokenizer.from_pretrained(cuad_checkpoint),
        AutoModelForQuestionAnswering.from_pretrained(cuad_checkpoint),
        pipeline("sentiment-analysis", model=sentiment_checkpoint),
    )
|
| 19 |
|
| 20 |
+
tokenizer, model, risk_analyzer = load_engines()
|
| 21 |
|
| 22 |
+
# --- 3. منطق الاستخراج الدقيق ومنع التكرار ---
|
| 23 |
+
def get_precise_clause(query, context, threshold=2.5, min_length=25):
    """Extract the contract span that best answers ``query``.

    The context is split into overlapping 512-token windows (stride 256) so
    that a clause straddling a window boundary is still seen whole in at
    least one window. Each window is scored by the sum of its best start and
    end logits, and the highest-scoring usable span across all windows wins.

    Args:
        query: The question describing the clause to look for.
        context: Full contract text to search.
        threshold: Minimum start+end logit sum a span must exceed; keeps the
            model from returning low-confidence guesses.
        min_length: The cleaned answer must be longer than this many
            characters, which filters out fragments and stray words.

    Returns:
        The extracted clause text, or ``None`` when no window yields a
        sufficiently confident and sufficiently long answer.
    """
    # Nothing to search: avoid running the model on an empty document.
    if not context or not context.strip():
        return None

    inputs = tokenizer(
        query, context, truncation="only_second", max_length=512, stride=256,
        return_overflowing_tokens=True, padding="max_length", return_tensors="pt",
    )

    best_answer = ""
    max_score = float("-inf")
    neg_inf = float("-inf")

    for input_ids, attention_mask in zip(inputs["input_ids"], inputs["attention_mask"]):
        with torch.no_grad():
            outputs = model(
                input_ids=input_ids.unsqueeze(0),
                attention_mask=attention_mask.unsqueeze(0),
            )

        # Windows are padded to max_length; padding can never be an answer.
        is_padding = attention_mask == 0
        start_logits = outputs.start_logits[0].masked_fill(is_padding, neg_inf)
        end_logits = outputs.end_logits[0].masked_fill(is_padding, neg_inf)

        start_idx = int(torch.argmax(start_logits))
        end_idx = int(torch.argmax(end_logits))

        # Skip windows with no usable span *before* comparing scores, so an
        # empty prediction cannot shadow a real answer from another window.
        # Index 0 is the leading <s> token (presumably the model's "no answer"
        # target - confirm against the checkpoint); end < start is an
        # inverted span that would decode to nothing.
        if start_idx == 0 or end_idx < start_idx:
            continue

        # Confidence score for this window's best span.
        current_score = float(start_logits[start_idx] + end_logits[end_idx])
        if current_score > threshold and current_score > max_score:
            max_score = current_score
            # skip_special_tokens drops <s>, </s> and <pad> in one step.
            best_answer = tokenizer.decode(
                input_ids[start_idx:end_idx + 1], skip_special_tokens=True
            )

    clean_ans = best_answer.strip()
    # Reject answers that are too short to be a meaningful clause.
    return clean_ans if len(clean_ans) > min_length else None
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
+
# --- 4. واجهة المستخدم ---
|
| 56 |
+
# Page header. The subtitle previously contained a mis-encoded character.
st.title("⚖️ LexGuard Pro: Comprehensive Legal Audit")
st.markdown("### نظام التدقيق القانوني الآلي وتحليل الثغرات")

# Sidebar: contract upload.
with st.sidebar:
    st.header("📥 إدخال العقد")
    uploaded_file = st.file_uploader("ارفع ملف العقد (PDF/DOCX)", type=["pdf", "docx"])
    st.info("النظام مهيأ لتحليل العقود وفق معايير القانون الإنجليزي والدولي.")

if uploaded_file:
    # Read the text out of the uploaded file.
    if uploaded_file.type == "application/pdf":
        # `or ""` guards against a page that yields no text (e.g. an
        # image-only page), which would otherwise break the join.
        raw_text = "".join(
            [(p.extract_text() or "") for p in PyPDF2.PdfReader(uploaded_file).pages]
        )
    else:
        raw_text = "\n".join([p.text for p in Document(uploaded_file).paragraphs])

    if st.button("🚀 ابدأ التدقيق الشامل الآن"):
        st.divider()

        # Legal audit plan: clause title -> targeted question for the QA model.
        audit_plan = {
            "Governing Law": "Which state or country's law governs this agreement and where is the jurisdiction?",
            "Termination for Convenience": "What is the notice period for termination without cause?",
            "Termination for Cause": "Under what conditions can the contract be terminated for breach?",
            "Limitation of Liability": "What is the monetary cap or limit on the provider's liability?",
            "Confidentiality Obligations": "What are the restrictions on disclosing trade secrets and for how long?",
            "Indemnification": "Which party is responsible for defending third-party legal claims?",
            "Force Majeure": "What unexpected events excuse a party from performing their duties?",
            "Intellectual Property": "Who owns the copyright and ownership of the software and work product?"
        }

        # Wording that marks a clause as potentially restrictive; it does not
        # change between clauses, so it is defined once outside the loop.
        risk_keywords = ["limit", "exclude", "sole discretion", "waive", "immediate"]

        progress_bar = st.progress(0)

        for step, (title, query) in enumerate(audit_plan.items(), start=1):
            clause = get_precise_clause(query, raw_text)

            with st.expander(f"📌 بند: {title}", expanded=True):
                if clause:
                    # Risk analysis: sentiment label plus keyword check.
                    sentiment = risk_analyzer(clause[:512])[0]
                    is_risky = sentiment['label'] == 'NEGATIVE' or any(
                        word in clause.lower() for word in risk_keywords
                    )

                    if is_risky:
                        st.error("🔴 تنبيه مخاطر: صياغة هذا البند قد تكون تقييدية أو مجحفة.")
                    else:
                        st.success("🟢 بند قياسي: النص يتبع القواعد العامة المتعارف عليها.")

                    st.write("**النص المستخرج من العقد:**")
                    st.code(clause, language="text")
                else:
                    # Gap analysis: the clause was not found in the contract.
                    st.warning(f"⚠️ بند مفقود: لم يتم العثور على نص صريح يتعلق بـ ({title}).")
                    st.info("نصيحة قانونية: غياب هذا البند في العقود الدولية قد يؤدي إلى نزاعات قضائية معقدة.")

            progress_bar.progress(step / len(audit_plan))

        st.balloons()
else:
    st.info("يرجى رفع ملف العقد من القائمة الجانبية لبدء عملية الفحص الآلي.")

# Footer.
st.markdown("---")
st.caption("Aya Samir | AI & Legal Consultant | Master's Researcher in Law")
|