Spaces:

Aditi132
/

Legal-text-simplifier

Sleeping

App Files Files Community

Aditi132 committed on Jan 23

Commit

3f9ad47

verified ·

1 Parent(s): 1b5fd52

Upload 2 files

Browse files

Files changed (2) hide show

app.py +143 -0
train.py +105 -0

app.py ADDED Viewed

	@@ -0,0 +1,143 @@

+import gradio as gr
+import torch
+from transformers import T5ForConditionalGeneration, T5Tokenizer
+MODEL_NAME = "t5-small"
+print("Loading model...")
+tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, legacy=False)
+model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model.to(device)
+model.eval()
+print(f"Model loaded on {device}!")
+def simplify_legal_text(legal_text, max_length=512, num_beams=4):
+    if not legal_text or not legal_text.strip():
+        return "Please enter some legal text to simplify."
+    if len(legal_text) > 5000:
+        return "Text too long! Please keep input under 5,000 characters."
+    try:
+        input_text = f"summarize: {legal_text}"
+        # ✅ FIXED: Use tokenizer as callable
+        encoded = tokenizer(
+            input_text,
+            max_length=1024,
+            truncation=True,
+            return_tensors="pt"
+        )
+        inputs = encoded.input_ids.to(device)
+        with torch.no_grad():
+            outputs = model.generate(
+                inputs,
+                max_length=max_length,
+                num_beams=num_beams,
+                early_stopping=True,
+                do_sample=False,
+                repetition_penalty=2.5,
+                length_penalty=1.0,
+                no_repeat_ngram_size=3
+            )
+        simplified_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        return simplified_text
+    except Exception as e:
+        return f"Error: {str(e)}. Please try again with shorter text."
+# Create Gradio interface
+with gr.Blocks(title="Legal Text Simplifier", theme=gr.themes.Soft()) as demo:
+    gr.Markdown(
+        """
+        # ⚖️ Legal Text Simplifier
+        Transform complex legal language into simple, easy-to-understand text.
+        **How to use:**
+        1. Paste your legal text in the input box
+        2. Adjust settings if needed (optional)
+        3. Click "Simplify" to get your simplified version
+        **Tips:**
+        - Works best with paragraphs or short documents
+        - For very long texts, break them into smaller sections
+        - The model uses AI to preserve meaning while simplifying language
+        """
+    )
+    with gr.Row():
+        with gr.Column(scale=2):
+            legal_input = gr.Textbox(
+                label="📝 Legal Text (Paste your complex legal text here)",
+                placeholder="Enter legal text to simplify...",
+                lines=10,
+                value="The party of the first part hereby agrees to indemnify and hold harmless the party of the second part from any and all claims, damages, losses, costs, and expenses..."
+            )
+            with gr.Row():
+                simplify_btn = gr.Button("✨ Simplify Text", variant="primary", size="lg")
+                clear_btn = gr.Button("🗑️ Clear", size="lg")
+        with gr.Column(scale=1):
+            gr.Markdown("### ⚙️ Advanced Settings")
+            max_length = gr.Slider(
+                minimum=100,
+                maximum=1000,
+                value=512,
+                step=50,
+                label="Max Output Length",
+                info="Longer = more detailed, but may be slower"
+            )
+            num_beams = gr.Slider(
+                minimum=1,
+                maximum=8,
+                value=4,
+                step=1,
+                label="Quality (Beam Search)",
+                info="Higher = better quality, slower generation"
+            )
+    simplified_output = gr.Textbox(
+        label="✨ Simplified Text",
+        lines=10,
+        interactive=False,
+        placeholder="Your simplified text will appear here..."
+    )
+    gr.Markdown(
+        """
+        ---
+        ### 💡 Example
+        **Input:** "The party of the first part hereby agrees to indemnify and hold harmless..."
+        **Output:** "The first party agrees to protect the second party from any claims or losses..."
+        ---
+        *Powered by T5 Transformer Model | Deployed for free on Hugging Face Spaces*
+        """
+    )
+    # Connect the function to the interface
+    simplify_btn.click(
+        fn=simplify_legal_text,
+        inputs=[legal_input, max_length, num_beams],
+        outputs=simplified_output
+    )
+    clear_btn.click(
+        fn=lambda: ("", ""),
+        outputs=[legal_input, simplified_output]
+    )
+# ... [Gradio UI code unchanged] ...
+if __name__ == "__main__":
+    # ✅ FIXED: No server_name/port for Spaces compatibility
+    demo.launch()

train.py ADDED Viewed

	@@ -0,0 +1,105 @@

+import os
+import torch
+from datasets import load_dataset
+from transformers import (
+    T5ForConditionalGeneration,
+    T5Tokenizer,
+    Seq2SeqTrainingArguments,
+    Seq2SeqTrainer,
+    DataCollatorForSeq2Seq
+)
+# --- Configuration (module-level constants read by main()) ---
+MODEL_NAME = "t5-small"  # checkpoint name passed to from_pretrained() for tokenizer and model
+OUTPUT_DIR = "./model_output"  # Trainer output_dir and where the final model/tokenizer are saved
+MAX_INPUT_LENGTH = 1024  # token cap for the prefixed "text" inputs; longer inputs are truncated
+MAX_TARGET_LENGTH = 128  # token cap for the "summary" targets; longer targets are truncated
+# Per-device batch size for both training and evaluation; watch memory use before raising it.
+BATCH_SIZE = 8
+EPOCHS = 3  # passed as num_train_epochs
+def main():
+    # Check for GPU
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    print(f"Using device: {device}")
+    if device == "cuda":
+        print(f"GPU Name: {torch.cuda.get_device_name(0)}")
+        print(f"Memory Allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
+    else:
+        print("WARNING: No GPU detected. Training will be slow on CPU.")
+    print(f"Loading model: {MODEL_NAME}...")
+    try:
+        tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, legacy=False)
+        model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
+        model.to(device) # Move model to GPU immediately
+    except Exception as e:
+        print(f"Error loading model: {e}")
+        return
+    # --- Load Dataset ---
+    print("Loading 'billsum' dataset...")
+    # Using 'ca_test' for a quick cycle
+    dataset = load_dataset("billsum", split="ca_test")
+    # Let's train on slightly more data now that we have a GPU
+    # Splitting the 1200 ca_test examples
+    dataset = dataset.train_test_split(test_size=0.1)
+    train_dataset = dataset["train"] # Uses ~1000 examples
+    eval_dataset = dataset["test"]   # Uses ~100 examples
+    print(f"Training on {len(train_dataset)} examples...")
+    # --- Preprocessing ---
+    prefix = "summarize: "
+    def preprocess_function(examples):
+        inputs = [prefix + doc for doc in examples["text"]]
+        model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)
+        labels = tokenizer(text_target=examples["summary"], max_length=MAX_TARGET_LENGTH, truncation=True)
+        model_inputs["labels"] = labels["input_ids"]
+        return model_inputs
+    print("Tokenizing data...")
+    tokenized_train = train_dataset.map(preprocess_function, batched=True)
+    tokenized_eval = eval_dataset.map(preprocess_function, batched=True)
+    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
+    # --- Training Args ---
+    training_args = Seq2SeqTrainingArguments(
+    output_dir=OUTPUT_DIR,
+    eval_strategy="epoch",          # ✅ Correct for transformers >= 4.40
+    learning_rate=2e-5,
+    per_device_train_batch_size=BATCH_SIZE,
+    per_device_eval_batch_size=BATCH_SIZE,
+    weight_decay=0.01,
+    save_total_limit=1,
+    num_train_epochs=EPOCHS,
+    predict_with_generate=True,
+    fp16=(device == "cuda"),       # Mixed precision on GPU
+    dataloader_num_workers=0,      # Safe for Windows
+    logging_steps=10,
+)
+    trainer = Seq2SeqTrainer(
+        model=model,
+        args=training_args,
+        train_dataset=tokenized_train,
+        eval_dataset=tokenized_eval,
+        tokenizer=tokenizer,
+        data_collator=data_collator,
+    )
+    print("Starting training...")
+    trainer.train()
+    print("Saving model...")
+    trainer.save_model(OUTPUT_DIR)
+    tokenizer.save_pretrained(OUTPUT_DIR)
+    print(f"Model saved to {OUTPUT_DIR}")
+if __name__ == "__main__":
+    main()