# contexto-api/src/summarizer.py
# Dev-ks04
# feat: Contexto FastAPI backend - intent-aware summarization engine
# 39028c9
import logging
import re
import sys
from typing import Optional, List, Dict, Any

import torch

from .preprocessing import TextPreprocessor, TechnicalDocumentParser
from .models import SummarizationModelLoader, IntentClassifier, ContextPreserver
from .utils import chunk_text
from .evaluation import SummaryEvaluator
from .keywords import KeywordExtractor
from .exporters import SummaryExporter
from .rag import RAGPipeline, ContextPreserver as RAGContextPreserver
from .model_selector import ModelSelector
from .intent_engine import (
    get_intent_config,
    get_level_config,
    get_quality_config,
    extract_intent_relevant_text,
    build_t5_input,
    postprocess_summary,
    translate_summary,
)
# Module-level logger, named after this module so handlers/levels can be configured per module.
logger = logging.getLogger(__name__)
class TechnicalDocumentSummarizer:
    """Main summarization pipeline for technical documents with language support.

    The constructor loads a model/tokenizer pair through
    ``SummarizationModelLoader`` and wires up the helper components
    (preprocessor, parser, intent classifier, RAG pipeline, model selector,
    evaluator, keyword extractor, exporter) used by the public methods.
    """

    # Splits text after sentence-ending punctuation followed by whitespace, so
    # decimals such as "3.5" are not treated as sentence boundaries.
    _SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')

    # Complex word -> plainer replacement used by ``_simplify_language``.
    _SIMPLIFICATIONS = {
        'utilize': 'use',
        'demonstrate': 'show',
        'implement': 'create',
        'facilitate': 'help',
        'novel': 'new',
        'proposed': 'suggested',
        'efficacy': 'effectiveness',
        'robust': 'strong',
        'comprehensive': 'complete',
        'subsequent': 'next',
        'aforementioned': 'mentioned',
        'henceforth': 'from now on',
    }

    # One whole-word, case-insensitive alternation over all keys above.
    _SIMPLIFY_PATTERN = re.compile(
        r'\b(' + '|'.join(map(re.escape, _SIMPLIFICATIONS)) + r')\b',
        re.IGNORECASE,
    )

    # Sentences longer than this many characters are split on ' and '.
    _LONG_SENTENCE_CHARS = 120

    def __init__(self, model_name: str = 't5-small', device: Optional[str] = None, language: Optional[str] = None):
        """
        Initialize the summarizer.

        Args:
            model_name: Name of the model to use (default: t5-small for speed)
            device: Device to run on ('cpu' or 'cuda'); auto-detected via
                ``torch.cuda.is_available()`` when omitted
            language: Language for summarization. When omitted, the user is
                prompted if stdin is an interactive terminal; otherwise
                'english' is used.
        """
        # Only prompt for language if running interactively and language not provided.
        # NOTE(review): when constructed from a terminal-launched server this
        # prompt blocks until answered - pass ``language`` explicitly there.
        if language is None:
            stdin = sys.stdin
            # sys.stdin can be None (e.g. detached/GUI launches); treat that as non-interactive.
            if stdin is not None and stdin.isatty():
                print("\n=== LANGUAGE SELECTION ===")
                print("Supported languages: english, spanish, french, german, italian,")
                print("portuguese, chinese, japanese, korean, arabic, hindi, russian, turkish, vietnamese, thai")
                language = input("\nEnter desired language for summarization (default: english): ").strip() or 'english'
            else:
                language = 'english'  # Default to English in non-interactive mode

        self.model_name = model_name
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        self.language = language

        self.preprocessor = TextPreprocessor(remove_stopwords=False)
        self.parser = TechnicalDocumentParser()
        self.model_loader = SummarizationModelLoader(model_name, self.device, language)
        self.intent_classifier = IntentClassifier()
        self.context_preserver = ContextPreserver()
        self.model, self.tokenizer = self.model_loader.load_model()
        self.evaluator = SummaryEvaluator()
        self.keyword_extractor = KeywordExtractor()
        self.exporter = SummaryExporter()
        self.rag_pipeline = RAGPipeline()
        self.model_selector = ModelSelector()
        self.model_cache = {}

        logger.info(f"Summarizer initialized with {model_name} for {language}")

    def auto_summarize(
        self,
        document: str,
        intent: str = 'technical_overview',
        quality_preference: str = 'balanced',
        summary_level: str = 'brief',
        language: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Auto-select settings and summarize with full intent / level / quality / language support.

        Length limits come from the level config; the beam count is the larger
        of the selector's recommendation (default 2) and the quality config.

        Args:
            document: Document text to summarize
            intent: Summarization intent
            quality_preference: Quality preset name passed to the selector and quality config
            summary_level: Level preset name that supplies max/min length
            language: Optional per-call language override

        Returns:
            Dict with keys 'summary', 'model', 'complexity', 'use_rag',
            'estimated_time' and 'reason'.
        """
        recommendation = self.model_selector.recommend_settings(document, quality_preference)
        logger.info(f"Model={recommendation['model']} | quality={quality_preference} | intent={intent} | level={summary_level}")
        use_rag = recommendation.get('use_rag', False)

        # Merge level + quality configs for generation params
        level_cfg = get_level_config(summary_level)
        quality_cfg = get_quality_config(quality_preference)
        max_length = level_cfg['max_length']
        min_length = level_cfg['min_length']
        num_beams = max(recommendation.get('num_beams', 2), quality_cfg['num_beams'])

        summary = self.summarize(
            document,
            intent=intent,
            summary_level=summary_level,
            quality_preference=quality_preference,
            max_length=max_length,
            min_length=min_length,
            num_beams=num_beams,
            language=language,
            use_rag=use_rag,
        )

        # NOTE(review): 'model' reports the selector's recommendation; generation
        # itself runs on the model loaded in __init__ - confirm this is intended.
        return {
            'summary': summary,
            'model': recommendation['model'],
            'complexity': str(self.model_selector.current_complexity),
            'use_rag': use_rag,
            'estimated_time': recommendation['estimated_time'],
            'reason': recommendation['reason'],
        }

    def summarize(
        self,
        document: str,
        intent: str = 'technical_overview',
        summary_level: str = 'brief',
        quality_preference: str = 'balanced',
        max_length: int = 130,
        min_length: int = 50,
        num_beams: int = 3,
        language: Optional[str] = None,
        use_rag: bool = False,
    ) -> str:
        """
        Full intent-aware summarization pipeline:
        1. Intent pre-filtering (sentence selection)
        2. Optional RAG context retrieval
        3. Model generation with quality-tuned params
        4. Intent + level post-processing
        5. Language translation (if non-English)

        Args:
            document: Document text to summarize
            intent: Intent string; normalised through the intent classifier
            summary_level: Level name forwarded to post-processing
            quality_preference: Quality preset name for generation params
            max_length: Maximum generated length passed to ``generate``
            min_length: Minimum generated length passed to ``generate``
            num_beams: Beam count passed to ``generate``
            language: Optional per-call language override
            use_rag: Index the pre-filtered text and summarize retrieved context

        Returns:
            The formatted (and possibly translated) summary; an empty string
            for an empty or whitespace-only document.
        """
        # Nothing to summarize: skip the model entirely instead of generating from empty input.
        if not document or not document.strip():
            return ''

        # NOTE(review): this override mutates the shared loader/tokenizer and is
        # not restored afterwards, while self.language is left untouched.
        if language and language != self.language:
            self.model_loader.language = language
            self.model_loader.language_code = self.model_loader.SUPPORTED_LANGUAGES.get(
                language.lower(), 'en_XX'
            )
            if hasattr(self.tokenizer, 'src_lang'):
                self.tokenizer.src_lang = self.model_loader.language_code

        # Validate intent string
        if isinstance(intent, str):
            intent = self.intent_classifier.classify_intent(intent)

        # Step 1: intent-aware pre-filtering
        intent_text = extract_intent_relevant_text(document, intent, max_chars=3000)
        logger.info(f"[Summarize] intent={intent} level={summary_level} quality={quality_preference}")
        logger.info(f"[Summarize] pre-filter: {len(intent_text)} chars selected")

        # Step 2: optional RAG
        if use_rag:
            indexing_stats = self.rag_pipeline.index_document(intent_text)
            logger.info(f"RAG indexed: {indexing_stats}")
            intent_prompt = self.intent_classifier.get_prompt_for_intent(intent)
            retrieved_chunks = self.rag_pipeline.retrieve_context(intent_prompt, k=3)
            merged_context = self.rag_pipeline.merge_context(
                [chunk for chunk, _ in retrieved_chunks],
                [score for _, score in retrieved_chunks]
            )
            # Fall back to the pre-filtered text when retrieval yields nothing.
            summary_text = merged_context or intent_text
        else:
            summary_text = intent_text

        # Step 3: generate
        quality_cfg = get_quality_config(quality_preference)
        raw_summary = self._generate_summary(
            summary_text, intent, max_length, min_length,
            num_beams, quality_cfg,
        )

        # Step 4: post-process (intent format + level formatting)
        formatted = postprocess_summary(raw_summary, intent, summary_level)

        # Step 5: translate if non-English
        target_lang = language or self.language
        if target_lang and target_lang.lower() not in ('english', 'en'):
            formatted = translate_summary(formatted, target_lang)

        return formatted

    def _prepare_for_summarization(
        self,
        abstract: str,
        text: str,
        max_length: int
    ) -> str:
        """
        Prepare text for summarization, handling long documents.

        Texts longer than 4000 characters are reduced to their first two
        chunks (chunk_size=1000, overlap=100); shorter texts are used whole.
        The abstract, when present, is placed first, separated by a space.

        Args:
            abstract: Document abstract if available
            text: Main document text
            max_length: Target max length (currently unused; kept for
                interface compatibility)

        Returns:
            Prepared text for summarization
        """
        pieces = [abstract] if abstract else []
        if len(text) > 4000:
            pieces.extend(chunk_text(text, chunk_size=1000, overlap=100)[:2])
        else:
            pieces.append(text)
        # Joining with a space keeps the abstract from running into the body.
        return " ".join(pieces).strip()

    def _generate_summary(
        self,
        text: str,
        intent: str,
        max_length: int,
        min_length: int,
        num_beams: int,
        quality_cfg: Optional[Dict[str, Any]] = None,
    ) -> str:
        """
        Generate summary using the intent-aware T5 prefix.

        quality_cfg supplies ``no_repeat_ngram_size`` (default 3) and
        ``length_penalty`` (default 1.2); the 'balanced' preset is used when
        it is omitted.

        Args:
            text: Text to summarize
            intent: Intent used to build the model input
            max_length: Maximum generated length
            min_length: Minimum generated length
            num_beams: Beam count for beam search
            quality_cfg: Optional quality settings dict

        Returns:
            Decoded summary string (special tokens skipped)
        """
        if quality_cfg is None:
            quality_cfg = get_quality_config('balanced')

        input_text = build_t5_input(text, intent)
        # Input is truncated to 512 tokens.
        # NOTE(review): padding='max_length' also pads a single sequence up to
        # 512; presumably unnecessary for batch size 1 - confirm before changing.
        inputs = self.tokenizer(
            input_text,
            return_tensors='pt',
            max_length=512,
            truncation=True,
            padding='max_length'
        ).to(self.device)

        # Inference only: no gradients needed.
        with torch.no_grad():
            summary_ids = self.model.generate(
                inputs['input_ids'],
                attention_mask=inputs.get('attention_mask'),
                max_length=max_length,
                min_length=min_length,
                num_beams=num_beams,
                early_stopping=True,
                do_sample=False,
                no_repeat_ngram_size=quality_cfg.get('no_repeat_ngram_size', 3),
                length_penalty=quality_cfg.get('length_penalty', 1.2),
            )

        summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        logger.info(f"[Generate] intent={intent} | words={len(summary.split())} | beams={num_beams}")
        return summary

    def _simplify_language(self, text: str) -> str:
        """
        Simplify language to make summary easy to understand.

        Replaces a fixed list of complex words with plainer ones (whole-word,
        case-insensitive, keeping the original capitalisation), then breaks
        sentences longer than 120 characters at ' and ' into separate
        sentences.

        Args:
            text: Text to simplify

        Returns:
            Simplified text
        """
        replacements = self._SIMPLIFICATIONS

        def _swap(match):
            # Carry the matched word's capitalisation over to its replacement.
            word = match.group(0)
            simple = replacements[word.lower()]
            if word.isupper():
                return simple.upper()
            if word[0].isupper():
                return simple[0].upper() + simple[1:]
            return simple

        simplified = self._SIMPLIFY_PATTERN.sub(_swap, text)

        result = []
        for sentence in self._SENTENCE_BOUNDARY.split(simplified):
            if len(sentence) <= self._LONG_SENTENCE_CHARS or ' and ' not in sentence:
                result.append(sentence)
                continue
            # Split very long sentences; each fragment becomes its own sentence
            # so dropping the conjunction does not leave run-on text.
            fragments = [part.strip() for part in sentence.split(' and ')]
            for position, fragment in enumerate(part for part in fragments if part):
                if position:
                    fragment = fragment[0].upper() + fragment[1:]
                if fragment[-1] not in '.!?':
                    fragment += '.'
                result.append(fragment)
        return ' '.join(result)

    def summarize_batch(
        self,
        documents: List[str],
        intent: str = 'technical_overview',
        language: Optional[str] = None,
        return_keywords: bool = False
    ) -> List[Any]:
        """
        Summarize multiple documents one after another.

        A failure on one document is logged and recorded; it does not stop
        the rest of the batch.

        Args:
            documents: List of documents to summarize
            intent: Summarization intent
            language: Language for summarization
            return_keywords: Currently accepted but not used

        Returns:
            One entry per document, in input order: the summary string on
            success, or ``{'error': <message>}`` on failure.
        """
        total = len(documents)
        logger.info(f"Batch summarizing {total} documents...")
        results = []
        for i, doc in enumerate(documents, 1):
            try:
                result = self.summarize(
                    doc,
                    intent=intent,
                    language=language
                )
            except Exception as e:
                # Batch boundary: keep going, but record the traceback.
                logger.exception(f"Error processing document {i}: {str(e)}")
                results.append({'error': str(e)})
            else:
                results.append(result)
                logger.info(f"Processed {i}/{total}")
        logger.info("Batch summarization complete")
        return results

    def _format_as_bullets(self, summary: str) -> str:
        """
        Format summary as bullet points.

        One bullet per sentence. Sentences are split at ending punctuation
        followed by whitespace, so decimals such as "3.5" stay intact;
        trailing periods are dropped from each bullet.

        Args:
            summary: Summary text

        Returns:
            Newline-separated bullets, or an empty string for empty input
        """
        bullets = []
        for sentence in self._SENTENCE_BOUNDARY.split(summary.strip()):
            cleaned = sentence.strip().rstrip('.').strip()
            if cleaned:
                bullets.append(f"• {cleaned}")
        return '\n'.join(bullets)

    def summarize_with_sections(
        self,
        document: str,
        max_length_per_section: int = 100
    ) -> Dict[str, str]:
        """
        Summarize document with separate summaries for each section.

        Sections with blank content are skipped.

        Args:
            document: Document text
            max_length_per_section: Max length for each section summary

        Returns:
            Dictionary of section summaries keyed by section title
        """
        # summarize() defaults to min_length=50; shrink it when the per-section
        # cap is at or below that so the length constraints stay feasible.
        if max_length_per_section > 50:
            min_length = 50
        else:
            min_length = max(1, max_length_per_section // 2)

        summaries = {}
        for section_title, section_content in self.parser.extract_sections(document):
            if not section_content.strip():
                continue
            summaries[section_title] = self.summarize(
                section_content,
                max_length=max_length_per_section,
                min_length=min_length,
            )
        return summaries

    def get_model_info(self) -> Dict[str, Any]:
        """Get information about the loaded model (delegates to the model loader)."""
        return self.model_loader.get_model_info()