Commit 0bda3c0
Parent(s): cef4a12
making changes

Files changed:
- Dockerfile +0 -10
- app.py +25 -27
- requirements.txt +0 -1
Dockerfile
CHANGED
@@ -1,24 +1,14 @@
 FROM python:3.9

-# Create non-root user
 RUN useradd -m -u 1000 user
 USER user
 ENV PATH="/home/user/.local/bin:$PATH"

 WORKDIR /app

-# Copy and install dependencies
 COPY --chown=user requirements.txt .
 RUN pip install --no-cache-dir --upgrade -r requirements.txt

-# Download NLTK 'punkt' to a known path
-RUN python -m nltk.downloader -d /home/user/nltk_data punkt
-
-# Set env so NLTK can find the punkt data
-ENV NLTK_DATA=/home/user/nltk_data
-
-# Copy app source
 COPY --chown=user . /app

-# Run the app
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
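With the punkt download gone, the image carries nothing beyond the pip dependencies, so a quick build-and-run sanity check is worth doing. Below is a minimal smoke-test sketch in Python; the image tag, the host port mapping, and the use of FastAPI's default /docs route are illustrative assumptions, not part of this commit:

# smoke_test.py: a sketch, assuming the container was started with
# something like `docker run -p 7860:7860 <your-image-tag>`.
# FastAPI serves interactive docs at /docs by default, so a 200 here means
# uvicorn started and app.py imported cleanly without the removed NLTK data.
import urllib.request

resp = urllib.request.urlopen("http://localhost:7860/docs", timeout=10)
print(resp.status)  # expect 200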
app.py
CHANGED
@@ -4,31 +4,26 @@ from transformers import pipeline, AutoTokenizer
 from typing import List
 import logging
 import torch
-import nltk
-from nltk.tokenize import sent_tokenize
+import re

-# FastAPI app init
 app = FastAPI()

 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger("summarizer")

-# …
-nltk.download("punkt")
-
-# Model config
+# Load model and tokenizer
 model_name = "sshleifer/distilbart-cnn-12-6"
 device = 0 if torch.cuda.is_available() else -1
 logger.info(f"Running summarizer on {'GPU' if device == 0 else 'CPU'}")
 summarizer = pipeline("summarization", model=model_name, device=device)
 tokenizer = AutoTokenizer.from_pretrained(model_name)

-# Token …
+# Token constraints
 MAX_MODEL_TOKENS = 1024
-SAFE_CHUNK_SIZE = 700 …
+SAFE_CHUNK_SIZE = 700

-# …
+# Pydantic schemas
 class SummarizationItem(BaseModel):
     content_id: str
     text: str
@@ -43,47 +38,50 @@ class SummarizationResponseItem(BaseModel):
 class BatchSummarizationResponse(BaseModel):
     summaries: List[SummarizationResponseItem]

-# …
+# Sentence-based chunking
+def split_sentences(text: str) -> list[str]:
+    return re.split(r'(?<=[.!?])\s+', text.strip())
+
 def chunk_text(text: str, max_tokens: int = SAFE_CHUNK_SIZE) -> List[str]:
-    sentences = sent_tokenize(text)
+    sentences = split_sentences(text)
     chunks = []
-    …
+    current_chunk_sentences = []

     for sentence in sentences:
-        …
-        token_count = len(tokenizer.encode(…
+        tentative_chunk = " ".join(current_chunk_sentences + [sentence])
+        token_count = len(tokenizer.encode(tentative_chunk, truncation=False))

         if token_count <= max_tokens:
-            …
+            current_chunk_sentences.append(sentence)
         else:
-            if …
-                chunks.append(…
-            …
+            if current_chunk_sentences:
+                chunks.append(" ".join(current_chunk_sentences))
+            current_chunk_sentences = [sentence]

-    if …
-        chunks.append(…
+    if current_chunk_sentences:
+        chunks.append(" ".join(current_chunk_sentences))

+    # Final filter: ensure nothing slipped through
     final_chunks = []
     for chunk in chunks:
         encoded = tokenizer(chunk, return_tensors="pt", truncation=False)
-        …
-        if …
+        token_len = encoded["input_ids"].shape[1]
+        if token_len <= MAX_MODEL_TOKENS:
             final_chunks.append(chunk)
         else:
-            logger.warning(f"[CHUNKING] Dropped chunk …
+            logger.warning(f"[CHUNKING] Dropped oversized chunk: {token_len} tokens")

     return final_chunks

-# …
+# Summarization endpoint
 @app.post("/summarize", response_model=BatchSummarizationResponse)
 async def summarize_batch(request: BatchSummarizationRequest):
     all_chunks = []
     chunk_map = []

     for item in request.inputs:
-        token_count = len(tokenizer.encode(item.text, truncation=False))
         chunks = chunk_text(item.text)
-        logger.info(f"[CHUNKING] content_id={item.content_id} …
+        logger.info(f"[CHUNKING] content_id={item.content_id} num_chunks={len(chunks)}")

         for chunk in chunks:
             all_chunks.append(chunk)
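The regex splitter that replaces NLTK's sent_tokenize is simple enough to check in isolation. A standalone sketch of its behavior (stdlib only); the abbreviation example is illustrative and shows the main trade-off versus punkt:

import re

# Same pattern as split_sentences in app.py: split after ., ! or ?
# when followed by whitespace.
def split_sentences(text: str) -> list[str]:
    return re.split(r'(?<=[.!?])\s+', text.strip())

print(split_sentences("It works. Really well!"))
# ['It works.', 'Really well!']

# Trade-off: unlike punkt, the regex has no abbreviation handling,
# so "Dr." ends a sentence here.
print(split_sentences("Dr. Smith arrived. He left."))
# ['Dr.', 'Smith arrived.', 'He left.']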
requirements.txt
CHANGED
@@ -2,5 +2,4 @@ fastapi
 uvicorn[standard]
 transformers
 torch
-nltk
 pydantic
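End to end, the service contract is unchanged by this commit. A hedged client sketch follows; the BatchSummarizationRequest model falls outside the hunks shown above, so the top-level "inputs" key is inferred from request.inputs in the endpoint, and the host/port come from the Dockerfile CMD:

import json
import urllib.request

# Assumed payload shape: {"inputs": [{"content_id": ..., "text": ...}]},
# inferred from SummarizationItem and request.inputs; not confirmed by the diff.
payload = {
    "inputs": [
        {"content_id": "doc-1", "text": "Long article text to summarize ..."}
    ]
}
req = urllib.request.Request(
    "http://localhost:7860/summarize",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    # BatchSummarizationResponse: {"summaries": [...]}
    print(json.load(resp))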