|
|
import logging |
|
|
|
|
|
MAX_INPUT_CHARS = 6000 |
|
|
|
|
|
def chunk_text(text, max_len=700): |
|
|
words = text.split() |
|
|
chunks = [] |
|
|
current = [] |
|
|
|
|
|
for word in words: |
|
|
current.append(word) |
|
|
if len(" ".join(current)) >= max_len: |
|
|
chunks.append(" ".join(current)) |
|
|
current = [] |
|
|
|
|
|
if current: |
|
|
chunks.append(" ".join(current)) |
|
|
|
|
|
return chunks[:3] |
|
|
|
|
|
def get_summary(text):
    """Summarize *text* with a BART summarization model.

    Splits the input into at most three chunks (see ``chunk_text``),
    summarizes each chunk independently, and joins the per-chunk summaries
    with newlines.

    Args:
        text: Raw text to summarize.

    Returns:
        The combined summary string — one line per chunk, with "..." standing
        in for any chunk whose summarization raised.

    Raises:
        ValueError: If the input exceeds ``MAX_INPUT_CHARS`` characters.
    """
    logging.info("=== AutoTLDR Incoming Request ===")
    # Lazy %-style args: the strings are only formatted if INFO is enabled.
    logging.info("Raw input (first 500 chars):\n%s", text[:500])
    logging.info("Total input length: %d characters", len(text))

    if len(text) > MAX_INPUT_CHARS:
        logging.warning("Input too long! Aborting.")
        raise ValueError("Text too long to summarize. Please try a shorter page.")

    # Build the pipeline once and reuse it across calls — loading the model
    # is by far the most expensive step, and the original rebuilt it on
    # every request. Cached as a function attribute so this block stays
    # self-contained; the import stays local so the module can be imported
    # without transformers installed.
    summarizer = getattr(get_summary, "_summarizer", None)
    if summarizer is None:
        from transformers import pipeline
        summarizer = pipeline("summarization", model="philschmid/bart-large-cnn-samsum")
        get_summary._summarizer = summarizer

    summaries = []
    chunks = chunk_text(text)

    for i, chunk in enumerate(chunks):
        logging.info("Summarizing chunk %d/%d (length: %d)", i + 1, len(chunks), len(chunk))
        try:
            result = summarizer(chunk, max_length=120, min_length=20, do_sample=False)
            summaries.append(result[0]['summary_text'])
            logging.info("Chunk %d summary: %s", i + 1, result[0]['summary_text'])
        except Exception:
            # Best-effort: a failed chunk becomes a placeholder rather than
            # aborting the whole summary. logging.exception records the
            # traceback for debugging.
            logging.exception("Error summarizing chunk %d", i + 1)
            summaries.append("...")

    final_summary = "\n".join(summaries)
    logging.info("=== Final Summary Output ===")
    logging.info(final_summary)

    return final_summary
|
|
|