Spaces:
Sleeping
Sleeping
File size: 5,818 Bytes
3cfeab7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 |
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from groq import Groq
import re
from nltk.tokenize import sent_tokenize
import nltk
# Download required NLTK data (best-effort: if the download fails, e.g. no
# network, sent_tokenize will raise a clearer LookupError later).
try:
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)
except Exception:
    # NOTE: narrowed from a bare ``except:`` which would also swallow
    # KeyboardInterrupt/SystemExit during module import.
    pass
def summarize_legal_document(text, max_sentences=5, groq_api_key=None, model_path=None):
    """Summarize legal document text.

    Args:
        text: Input text to summarize.
        max_sentences: Maximum number of sentences in the summary
            (clamped to the range 3..20).
        groq_api_key: Optional Groq API key for enhanced summarization.
        model_path: Optional custom model path (currently unused here).

    Returns:
        Dictionary with the summary and metadata; on failure the dict
        carries ``success=False`` and an ``error`` message.
    """
    if not text or not text.strip():
        return {"error": "Empty text provided", "success": False}

    # Clamp the requested summary size to a sensible window.
    max_sentences = min(max(max_sentences, 3), 20)

    report = {
        "original_length": len(text),
        "word_count": len(text.split()),
        "sentence_count": len(sent_tokenize(text)),
        "success": False,
    }

    try:
        # An extractive summary is always produced as the baseline.
        report["summary"] = _extractive_summarize(text, max_sentences)

        # Optionally upgrade to an LLM-generated summary; any failure here
        # silently falls back to the extractive baseline.
        if groq_api_key:
            try:
                enhanced = _groq_summarize(text, max_sentences, groq_api_key)
                if enhanced:
                    report["summary"] = enhanced
            except Exception:
                pass

        # Final output metrics.
        produced = report.get("summary", "")
        report["summary_length"] = len(produced)
        if report["original_length"] > 0:
            report["compression_ratio"] = report["summary_length"] / report["original_length"]
        else:
            report["compression_ratio"] = 0
        report["success"] = True
    except Exception as exc:
        report["error"] = str(exc)
        report["success"] = False

    return report
def _extractive_summarize(text, max_sentences):
    """Pick the highest-scoring sentences from a legal document.

    Sentences are scored on legal-keyword hits, document position,
    length, numbers/amounts, and citation patterns; the top
    *max_sentences* are returned joined in their original order.
    """
    sentences = sent_tokenize(text)
    if len(sentences) <= max_sentences:
        return text

    legal_keywords = [
        'court', 'judge', 'plaintiff', 'defendant', 'appellant', 'respondent',
        'held', 'ruled', 'decided', 'judgment', 'order', 'section', 'article',
        'provision', 'law', 'legal', 'case', 'appeal', 'petition', 'writ',
        'contract', 'agreement', 'liability', 'damages', 'evidence', 'witness',
        'statute', 'regulation', 'finding', 'conclusion', 'reasoning'
    ]

    scored = []
    for idx, sentence in enumerate(sentences):
        if not sentence.strip():
            continue
        lowered = sentence.lower()

        # One point per distinct legal keyword present.
        points = sum(1 for kw in legal_keywords if kw in lowered)

        # Opening and closing sentences tend to carry holdings/conclusions.
        if idx == 0:
            points += 3
        elif idx == len(sentences) - 1:
            points += 2
        elif idx < len(sentences) * 0.2:
            points += 1

        # Favor mid-length sentences.
        n_words = len(sentence.split())
        if 15 <= n_words <= 40:
            points += 2
        elif 10 <= n_words <= 50:
            points += 1

        # Years, percentages, and dollar amounts.
        if re.search(r'\b\d{4}\b|\b\d+\s*(percent|%|\$)', sentence):
            points += 1

        # Citation-like patterns (reporter cites, "v. Name").
        if re.search(r'\d+\s+[A-Z][a-z]+\.?\s+\d+|\bv\.\s+[A-Z]', sentence):
            points += 2

        scored.append((points, idx, sentence))

    # Highest score first (stable sort keeps ties in document order)...
    scored.sort(key=lambda item: item[0], reverse=True)
    chosen = scored[:max_sentences]
    # ...then restore document order for readability.
    chosen.sort(key=lambda item: item[1])
    return ' '.join(entry[2] for entry in chosen)
def _groq_summarize(text, max_sentences, api_key):
    """Summarize *text* with a Groq-hosted LLM.

    Returns the model's summary string, or None on any failure or if the
    response looks too short to be a real summary (<= 20 chars).
    """
    try:
        client = Groq(api_key=api_key)

        # Keep the prompt within a safe context budget.
        if len(text) > 6000:
            text = text[:6000] + "\n[...text truncated...]"

        system_prompt = """You are an expert legal document summarizer. Create concise, accurate summaries that capture the most important information.
Guidelines:
1. Focus on key legal facts, holdings, and conclusions
2. Preserve important legal terminology and concepts
3. Maintain logical flow of legal reasoning
4. Include relevant case citations, statutes, or regulations
5. Be precise and avoid unnecessary elaboration"""

        user_prompt = f"""Please summarize the following legal document in approximately {max_sentences} sentences:
{text}
Provide a clear, concise summary:"""

        completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            model="llama-3.1-8b-instant",
            temperature=0.2,
            max_tokens=800,
            top_p=0.9,
        )

        candidate = completion.choices[0].message.content.strip()
        if candidate and len(candidate) > 20:
            return candidate
    except Exception:
        # Best-effort by design: the caller falls back to the
        # extractive summary when this returns None.
        pass
    return None
def _chunk_text(text, max_words):
"""Split text into chunks for processing"""
words = text.split()
chunks = []
for i in range(0, len(words), max_words):
chunk_words = words[i:i + max_words]
if chunk_words:
chunks.append(' '.join(chunk_words))
return chunks |