Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,4 +1,6 @@
|
|
| 1 |
import os
|
|
|
|
|
|
|
| 2 |
import io
|
| 3 |
import torch
|
| 4 |
import uvicorn
|
|
@@ -19,7 +21,7 @@ from pyngrok import ngrok
|
|
| 19 |
from threading import Thread
|
| 20 |
import time
|
| 21 |
import uuid
|
| 22 |
-
import subprocess #
|
| 23 |
|
| 24 |
# ✅ Ensure compatibility with Google Colab
|
| 25 |
try:
|
|
@@ -49,7 +51,7 @@ app.add_middleware(
|
|
| 49 |
|
| 50 |
# ✅ Initialize document storage
|
| 51 |
document_storage = {}
|
| 52 |
-
chat_history = [] #
|
| 53 |
|
| 54 |
# ✅ Function to store document context by task ID
|
| 55 |
def store_document_context(task_id, text):
|
|
@@ -68,26 +70,18 @@ def load_document_context(task_id):
|
|
| 68 |
|
| 69 |
def fine_tune_cuad_model():
|
| 70 |
"""
|
| 71 |
-
Fine tunes a
|
| 72 |
-
|
| 73 |
-
adjust training parameters as needed.
|
| 74 |
"""
|
| 75 |
from datasets import load_dataset
|
| 76 |
import numpy as np
|
| 77 |
-
|
| 78 |
-
from transformers import Trainer, TrainingArguments
|
| 79 |
-
from transformers import AutoModelForQuestionAnswering
|
| 80 |
|
| 81 |
print("✅ Loading CUAD dataset for fine tuning...")
|
| 82 |
-
# Load the CUAD QA dataset (SQuAD-style) with custom code allowed
|
| 83 |
dataset = load_dataset("theatticusproject/cuad-qa", trust_remote_code=True)
|
| 84 |
|
| 85 |
-
# Use the train split with a larger subset for production fine tuning
|
| 86 |
if "train" in dataset:
|
| 87 |
-
# Select a larger subset for training, e.g., 1000 examples
|
| 88 |
train_dataset = dataset["train"].select(range(1000))
|
| 89 |
-
|
| 90 |
-
# For validation, you might select around 200 examples
|
| 91 |
if "validation" in dataset:
|
| 92 |
val_dataset = dataset["validation"].select(range(200))
|
| 93 |
else:
|
|
@@ -99,12 +93,10 @@ def fine_tune_cuad_model():
|
|
| 99 |
|
| 100 |
print("✅ Preparing training features...")
|
| 101 |
|
| 102 |
-
# Load a QA model and its tokenizer. Here we use deepset/roberta-base-squad2.
|
| 103 |
tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
|
| 104 |
model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")
|
| 105 |
|
| 106 |
def prepare_train_features(examples):
|
| 107 |
-
# Tokenize with question and context; use truncation only on the context.
|
| 108 |
tokenized_examples = tokenizer(
|
| 109 |
examples["question"],
|
| 110 |
examples["context"],
|
|
@@ -153,11 +145,9 @@ def fine_tune_cuad_model():
|
|
| 153 |
train_dataset = train_dataset.map(prepare_train_features, batched=True, remove_columns=train_dataset.column_names)
|
| 154 |
val_dataset = val_dataset.map(prepare_train_features, batched=True, remove_columns=val_dataset.column_names)
|
| 155 |
|
| 156 |
-
# Set format for PyTorch QA training
|
| 157 |
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])
|
| 158 |
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])
|
| 159 |
|
| 160 |
-
# For QA tasks, computing metrics can be more complex; here we skip metrics for brevity.
|
| 161 |
training_args = TrainingArguments(
|
| 162 |
output_dir="./fine_tuned_legal_qa",
|
| 163 |
evaluation_strategy="steps",
|
|
@@ -170,7 +160,7 @@ def fine_tune_cuad_model():
|
|
| 170 |
logging_steps=50,
|
| 171 |
save_steps=100,
|
| 172 |
load_best_model_at_end=True,
|
| 173 |
-
report_to=[] #
|
| 174 |
)
|
| 175 |
|
| 176 |
print("✅ Starting fine tuning on CUAD QA dataset...")
|
|
@@ -203,8 +193,7 @@ try:
|
|
| 203 |
nlp = spacy.load("en_core_web_sm")
|
| 204 |
print("✅ Loading NLP models...")
|
| 205 |
|
| 206 |
-
#
|
| 207 |
-
from transformers import AutoTokenizer
|
| 208 |
summarizer = pipeline(
|
| 209 |
"summarization",
|
| 210 |
model="nsi319/legal-pegasus",
|
|
@@ -213,14 +202,11 @@ try:
|
|
| 213 |
)
|
| 214 |
|
| 215 |
embedding_model = SentenceTransformer("all-mpnet-base-v2", device=device)
|
| 216 |
-
ner_model = pipeline("ner", model="dslim/bert-base-NER",
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
device_map="auto" if torch.cuda.is_available() else "cpu")
|
| 222 |
-
|
| 223 |
-
# ✅ Load or Fine Tune CUAD QA Model
|
| 224 |
if os.path.exists("fine_tuned_legal_qa"):
|
| 225 |
print("✅ Loading fine-tuned CUAD QA model from fine_tuned_legal_qa...")
|
| 226 |
cuad_tokenizer = AutoTokenizer.from_pretrained("fine_tuned_legal_qa")
|
|
@@ -239,7 +225,6 @@ except Exception as e:
|
|
| 239 |
raise RuntimeError(f"Error loading models: {str(e)}")
|
| 240 |
|
| 241 |
from transformers import pipeline
|
| 242 |
-
|
| 243 |
qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
|
| 244 |
|
| 245 |
def legal_chatbot(user_input, context):
|
|
@@ -260,11 +245,10 @@ def extract_text_from_pdf(pdf_file):
|
|
| 260 |
raise HTTPException(status_code=400, detail=f"PDF extraction failed: {str(e)}")
|
| 261 |
|
| 262 |
def process_video_to_text(video_file_path):
|
| 263 |
-
"""
|
| 264 |
try:
|
| 265 |
print(f"Processing video file at {video_file_path}")
|
| 266 |
temp_audio_path = os.path.join("temp", "extracted_audio.wav")
|
| 267 |
-
# Use ffmpeg command to extract audio from the video file
|
| 268 |
cmd = [
|
| 269 |
"ffmpeg", "-i", video_file_path, "-vn",
|
| 270 |
"-acodec", "pcm_s16le", "-ar", "44100", "-ac", "2",
|
|
@@ -283,7 +267,7 @@ def process_video_to_text(video_file_path):
|
|
| 283 |
raise HTTPException(status_code=400, detail=f"Video processing failed: {str(e)}")
|
| 284 |
|
| 285 |
def process_audio_to_text(audio_file_path):
|
| 286 |
-
"""
|
| 287 |
try:
|
| 288 |
print(f"Processing audio file at {audio_file_path}")
|
| 289 |
result = speech_to_text(audio_file_path)
|
|
@@ -429,7 +413,7 @@ def analyze_contract_clauses(text):
|
|
| 429 |
inputs = cuad_tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512).to(device)
|
| 430 |
with torch.no_grad():
|
| 431 |
outputs = cuad_model(**inputs)
|
| 432 |
-
predictions = torch.sigmoid(outputs.start_logits).cpu().numpy()[0]
|
| 433 |
for idx, confidence in enumerate(predictions):
|
| 434 |
if confidence > 0.5 and idx < len(clause_types):
|
| 435 |
clauses_detected.append({"type": clause_types[idx], "confidence": float(confidence)})
|
|
|
|
| 1 |
import os
|
| 2 |
+
os.environ["TRANSFORMERS_NO_FAST"] = "1" # Force use of slow tokenizers
|
| 3 |
+
|
| 4 |
import io
|
| 5 |
import torch
|
| 6 |
import uvicorn
|
|
|
|
| 21 |
from threading import Thread
|
| 22 |
import time
|
| 23 |
import uuid
|
| 24 |
+
import subprocess # For running ffmpeg commands
|
| 25 |
|
| 26 |
# ✅ Ensure compatibility with Google Colab
|
| 27 |
try:
|
|
|
|
| 51 |
|
| 52 |
# ✅ Initialize document storage
|
| 53 |
document_storage = {}
|
| 54 |
+
chat_history = [] # Global chat history
|
| 55 |
|
| 56 |
# ✅ Function to store document context by task ID
|
| 57 |
def store_document_context(task_id, text):
|
|
|
|
| 70 |
|
| 71 |
def fine_tune_cuad_model():
|
| 72 |
"""
|
| 73 |
+
Fine tunes a QA model on the CUAD dataset for clause extraction.
|
| 74 |
+
This demo uses one epoch; adjust parameters as needed.
|
|
|
|
| 75 |
"""
|
| 76 |
from datasets import load_dataset
|
| 77 |
import numpy as np
|
| 78 |
+
from transformers import Trainer, TrainingArguments, AutoModelForQuestionAnswering
|
|
|
|
|
|
|
| 79 |
|
| 80 |
print("✅ Loading CUAD dataset for fine tuning...")
|
|
|
|
| 81 |
dataset = load_dataset("theatticusproject/cuad-qa", trust_remote_code=True)
|
| 82 |
|
|
|
|
| 83 |
if "train" in dataset:
|
|
|
|
| 84 |
train_dataset = dataset["train"].select(range(1000))
|
|
|
|
|
|
|
| 85 |
if "validation" in dataset:
|
| 86 |
val_dataset = dataset["validation"].select(range(200))
|
| 87 |
else:
|
|
|
|
| 93 |
|
| 94 |
print("✅ Preparing training features...")
|
| 95 |
|
|
|
|
| 96 |
tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
|
| 97 |
model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")
|
| 98 |
|
| 99 |
def prepare_train_features(examples):
|
|
|
|
| 100 |
tokenized_examples = tokenizer(
|
| 101 |
examples["question"],
|
| 102 |
examples["context"],
|
|
|
|
| 145 |
train_dataset = train_dataset.map(prepare_train_features, batched=True, remove_columns=train_dataset.column_names)
|
| 146 |
val_dataset = val_dataset.map(prepare_train_features, batched=True, remove_columns=val_dataset.column_names)
|
| 147 |
|
|
|
|
| 148 |
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])
|
| 149 |
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])
|
| 150 |
|
|
|
|
| 151 |
training_args = TrainingArguments(
|
| 152 |
output_dir="./fine_tuned_legal_qa",
|
| 153 |
evaluation_strategy="steps",
|
|
|
|
| 160 |
logging_steps=50,
|
| 161 |
save_steps=100,
|
| 162 |
load_best_model_at_end=True,
|
| 163 |
+
report_to=[] # Disable wandb logging
|
| 164 |
)
|
| 165 |
|
| 166 |
print("✅ Starting fine tuning on CUAD QA dataset...")
|
|
|
|
| 193 |
nlp = spacy.load("en_core_web_sm")
|
| 194 |
print("✅ Loading NLP models...")
|
| 195 |
|
| 196 |
+
# Initialize summarizer with a slow tokenizer
|
|
|
|
| 197 |
summarizer = pipeline(
|
| 198 |
"summarization",
|
| 199 |
model="nsi319/legal-pegasus",
|
|
|
|
| 202 |
)
|
| 203 |
|
| 204 |
embedding_model = SentenceTransformer("all-mpnet-base-v2", device=device)
|
| 205 |
+
ner_model = pipeline("ner", model="dslim/bert-base-NER", device=0 if torch.cuda.is_available() else -1)
|
| 206 |
+
speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-medium", chunk_length_s=30,
|
| 207 |
+
device_map="auto" if torch.cuda.is_available() else "cpu")
|
| 208 |
+
|
| 209 |
+
# Load or fine tune CUAD QA model
|
|
|
|
|
|
|
|
|
|
| 210 |
if os.path.exists("fine_tuned_legal_qa"):
|
| 211 |
print("✅ Loading fine-tuned CUAD QA model from fine_tuned_legal_qa...")
|
| 212 |
cuad_tokenizer = AutoTokenizer.from_pretrained("fine_tuned_legal_qa")
|
|
|
|
| 225 |
raise RuntimeError(f"Error loading models: {str(e)}")
|
| 226 |
|
| 227 |
from transformers import pipeline
|
|
|
|
| 228 |
qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
|
| 229 |
|
| 230 |
def legal_chatbot(user_input, context):
|
|
|
|
| 245 |
raise HTTPException(status_code=400, detail=f"PDF extraction failed: {str(e)}")
|
| 246 |
|
| 247 |
def process_video_to_text(video_file_path):
|
| 248 |
+
"""Extracts audio from video using ffmpeg and converts to text."""
|
| 249 |
try:
|
| 250 |
print(f"Processing video file at {video_file_path}")
|
| 251 |
temp_audio_path = os.path.join("temp", "extracted_audio.wav")
|
|
|
|
| 252 |
cmd = [
|
| 253 |
"ffmpeg", "-i", video_file_path, "-vn",
|
| 254 |
"-acodec", "pcm_s16le", "-ar", "44100", "-ac", "2",
|
|
|
|
| 267 |
raise HTTPException(status_code=400, detail=f"Video processing failed: {str(e)}")
|
| 268 |
|
| 269 |
def process_audio_to_text(audio_file_path):
|
| 270 |
+
"""Processes an audio file and converts it to text."""
|
| 271 |
try:
|
| 272 |
print(f"Processing audio file at {audio_file_path}")
|
| 273 |
result = speech_to_text(audio_file_path)
|
|
|
|
| 413 |
inputs = cuad_tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512).to(device)
|
| 414 |
with torch.no_grad():
|
| 415 |
outputs = cuad_model(**inputs)
|
| 416 |
+
predictions = torch.sigmoid(outputs.start_logits).cpu().numpy()[0]
|
| 417 |
for idx, confidence in enumerate(predictions):
|
| 418 |
if confidence > 0.5 and idx < len(clause_types):
|
| 419 |
clauses_detected.append({"type": clause_types[idx], "confidence": float(confidence)})
|