Spaces:
Sleeping
Sleeping
Commit ·
d1754e4
1
Parent(s): a67ba36
Truncate oversized chunks at the token level before summarization
Browse files
app.py
CHANGED
|
@@ -29,7 +29,8 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
|
|
| 29 |
|
| 30 |
# Token constraints
|
| 31 |
MAX_MODEL_TOKENS = 1024
|
| 32 |
-
SAFE_CHUNK_SIZE = 600 #
|
|
|
|
| 33 |
|
| 34 |
# Pydantic schemas
|
| 35 |
class SummarizationItem(BaseModel):
|
|
@@ -79,6 +80,14 @@ def split_sentences(text: str, max_sentence_tokens: int = SAFE_CHUNK_SIZE) -> li
|
|
| 79 |
|
| 80 |
return split_results
|
| 81 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
# Chunking based on token length
|
| 83 |
def chunk_text(text: str, max_tokens: int = SAFE_CHUNK_SIZE) -> List[str]:
|
| 84 |
sentences = split_sentences(text)
|
|
@@ -121,7 +130,7 @@ async def summarize_batch(request: BatchSummarizationRequest):
|
|
| 121 |
logger.info(f"[CHUNKING] content_id={item.content_id} num_chunks={len(chunks)}")
|
| 122 |
|
| 123 |
for chunk in chunks:
|
| 124 |
-
all_chunks.append(chunk)
|
| 125 |
chunk_map.append(item.content_id)
|
| 126 |
|
| 127 |
if not all_chunks:
|
|
|
|
| 29 |
|
| 30 |
# Token constraints
|
| 31 |
MAX_MODEL_TOKENS = 1024
|
| 32 |
+
SAFE_CHUNK_SIZE = 600 # Safe for aggregation
|
| 33 |
+
TRUNCATED_TOKENS = MAX_MODEL_TOKENS - 2 # Leave room for special tokens
|
| 34 |
|
| 35 |
# Pydantic schemas
|
| 36 |
class SummarizationItem(BaseModel):
|
|
|
|
| 80 |
|
| 81 |
return split_results
|
| 82 |
|
| 83 |
+
# Truncate text safely at token-level
|
| 84 |
+
def truncate_text(text: str, max_tokens: int = TRUNCATED_TOKENS) -> str:
|
| 85 |
+
tokens = tokenizer.encode(text, add_special_tokens=False)
|
| 86 |
+
if len(tokens) <= max_tokens:
|
| 87 |
+
return text
|
| 88 |
+
truncated = tokens[:max_tokens]
|
| 89 |
+
return tokenizer.decode(truncated, skip_special_tokens=True)
|
| 90 |
+
|
| 91 |
# Chunking based on token length
|
| 92 |
def chunk_text(text: str, max_tokens: int = SAFE_CHUNK_SIZE) -> List[str]:
|
| 93 |
sentences = split_sentences(text)
|
|
|
|
| 130 |
logger.info(f"[CHUNKING] content_id={item.content_id} num_chunks={len(chunks)}")
|
| 131 |
|
| 132 |
for chunk in chunks:
|
| 133 |
+
all_chunks.append(truncate_text(chunk)) # ✅ enforce max length
|
| 134 |
chunk_map.append(item.content_id)
|
| 135 |
|
| 136 |
if not all_chunks:
|