Spaces:

the-carnage
/

docurizer

Sleeping

the-carnage committed on Mar 3

Commit

216c20d

1 Parent(s): bd0a950

Fix repetition: return original text for very short inputs, use greedy decoding for short inputs (beam search otherwise), and enforce min_length < max_length

Files changed (1) hide show

app.py CHANGED Viewed

@@ -42,10 +42,39 @@ def summarize_text(text, min_Len, max_Len):
     input_text = "summarize: " + text[:4000]
     inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
     input_token_count = inputs.input_ids.shape[1]
-    # Cap lengths to avoid repetition when input is shorter than requested output
-    effective_max = min(max_Len, max(input_token_count - 1, 10))
-    effective_min = min(min_Len, max(effective_max // 2, 5))
-    summary_ids = model.generate(inputs.input_ids, max_length=effective_max, min_length=effective_min, length_penalty=2.0, num_beams=4, early_stopping=True)
     return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
 tab1, tab2, tab3 = st.tabs(["📝 Text", "🖼️ Image", "📄 PDF"])

     input_text = "summarize: " + text[:4000]
     inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
     input_token_count = inputs.input_ids.shape[1]
+    # For very short inputs, just return the original text
+    if input_token_count < 15:
+        return text.strip()
+    # Cap lengths to avoid repetition - max should not exceed input length
+    effective_max = min(max_Len, max(int(input_token_count * 0.6), 20))
+    effective_min = 5  # Minimum 5 tokens for a summary
+    # Ensure min < max
+    if effective_min >= effective_max:
+        effective_min = max(1, effective_max - 5)
+    # Use simpler generation for short inputs
+    if input_token_count < 50:
+        summary_ids = model.generate(
+            inputs.input_ids,
+            max_length=effective_max,
+            min_length=effective_min,
+            do_sample=False,  # Deterministic
+            num_beams=1,  # No beam search for short inputs
+            early_stopping=True
+        )
+    else:
+        summary_ids = model.generate(
+            inputs.input_ids,
+            max_length=effective_max,
+            min_length=effective_min,
+            length_penalty=2.0,
+            num_beams=4,
+            early_stopping=True
+        )
     return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
 tab1, tab2, tab3 = st.tabs(["📝 Text", "🖼️ Image", "📄 PDF"])