"""Intent-aware summarization pipeline for technical documents."""

import logging
import re
import sys
from typing import Optional, List, Dict, Any, Union

import torch

from .preprocessing import TextPreprocessor, TechnicalDocumentParser
from .models import SummarizationModelLoader, IntentClassifier, ContextPreserver
from .utils import chunk_text
from .evaluation import SummaryEvaluator
from .keywords import KeywordExtractor
from .exporters import SummaryExporter
from .rag import RAGPipeline, ContextPreserver as RAGContextPreserver
from .model_selector import ModelSelector
from .intent_engine import (
    get_intent_config,
    get_level_config,
    get_quality_config,
    extract_intent_relevant_text,
    build_t5_input,
    postprocess_summary,
    translate_summary,
)

logger = logging.getLogger(__name__)


class TechnicalDocumentSummarizer:
    """Main summarization pipeline for technical documents with language support."""

    # Word-level substitutions applied by ``_simplify_language``
    # (matched case-insensitively on word boundaries).
    _SIMPLIFICATIONS = {
        'utilize': 'use',
        'demonstrate': 'show',
        'implement': 'create',
        'facilitate': 'help',
        'novel': 'new',
        'proposed': 'suggested',
        'efficacy': 'effectiveness',
        'robust': 'strong',
        'comprehensive': 'complete',
        'subsequent': 'next',
        'aforementioned': 'mentioned',
        'henceforth': 'from now on',
    }

    def __init__(
        self,
        model_name: str = 't5-small',
        device: Optional[str] = None,
        language: Optional[str] = None,
    ):
        """
        Initialize the summarizer.

        Args:
            model_name: Name of the model to use (default: t5-small for speed)
            device: Device to run on ('cpu' or 'cuda'); when omitted, 'cuda' is
                used if ``torch.cuda.is_available()``, otherwise 'cpu'
            language: Language for summarization. When omitted, the user is
                prompted if stdin is an interactive terminal; otherwise
                'english' is used.
        """
        # Only prompt for language if running interactively and language not provided
        if language is None:
            if sys.stdin.isatty():  # Check if running in interactive terminal
                print("\n=== LANGUAGE SELECTION ===")
                print("Supported languages: english, spanish, french, german, italian,")
                print("portuguese, chinese, japanese, korean, arabic, hindi, russian, turkish, vietnamese, thai")
                language = input(
                    "\nEnter desired language for summarization (default: english): "
                ).strip() or 'english'
            else:
                language = 'english'  # Default to English in non-interactive mode

        self.model_name = model_name
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        self.language = language

        self.preprocessor = TextPreprocessor(remove_stopwords=False)
        self.parser = TechnicalDocumentParser()
        self.model_loader = SummarizationModelLoader(model_name, self.device, language)
        self.intent_classifier = IntentClassifier()
        self.context_preserver = ContextPreserver()

        self.model, self.tokenizer = self.model_loader.load_model()

        self.evaluator = SummaryEvaluator()
        self.keyword_extractor = KeywordExtractor()
        self.exporter = SummaryExporter()
        self.rag_pipeline = RAGPipeline()
        self.model_selector = ModelSelector()
        self.model_cache = {}

        logger.info("Summarizer initialized with %s for %s", model_name, language)

    def auto_summarize(
        self,
        document: str,
        intent: str = 'technical_overview',
        quality_preference: str = 'balanced',
        summary_level: str = 'brief',
        language: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Auto-select best model and summarize with full
        intent / level / quality / language support.

        Args:
            document: Document text to summarize
            intent: Summarization intent
            quality_preference: Quality preset passed to the model selector
                and to ``get_quality_config``
            summary_level: Level preset passed to ``get_level_config``
            language: Optional per-call language override

        Returns:
            Dictionary with keys 'summary', 'model', 'complexity', 'use_rag',
            'estimated_time' and 'reason'.
        """
        recommendation = self.model_selector.recommend_settings(document, quality_preference)
        logger.info(
            "Model=%s | quality=%s | intent=%s | level=%s",
            recommendation['model'], quality_preference, intent, summary_level,
        )

        use_rag = recommendation.get('use_rag', False)

        # Merge level + quality + intent configs for generation params
        level_cfg = get_level_config(summary_level)
        quality_cfg = get_quality_config(quality_preference)

        max_length = level_cfg['max_length']
        min_length = level_cfg['min_length']
        # Use the larger of the selector's and the quality preset's beam count.
        num_beams = max(recommendation.get('num_beams', 2), quality_cfg['num_beams'])

        summary = self.summarize(
            document,
            intent=intent,
            summary_level=summary_level,
            quality_preference=quality_preference,
            max_length=max_length,
            min_length=min_length,
            num_beams=num_beams,
            language=language,
            use_rag=use_rag,
        )

        return {
            'summary': summary,
            'model': recommendation['model'],
            'complexity': str(self.model_selector.current_complexity),
            'use_rag': use_rag,
            'estimated_time': recommendation['estimated_time'],
            'reason': recommendation['reason'],
        }

    def summarize(
        self,
        document: str,
        intent: str = 'technical_overview',
        summary_level: str = 'brief',
        quality_preference: str = 'balanced',
        max_length: int = 130,
        min_length: int = 50,
        num_beams: int = 3,
        language: Optional[str] = None,
        use_rag: bool = False,
    ) -> str:
        """
        Full intent-aware summarization pipeline:
          1. Intent pre-filtering (sentence selection)
          2. Optional RAG context retrieval
          3. Model generation with quality-tuned params
          4. Intent + level post-processing
          5. Language translation (if non-English)

        Args:
            document: Document text to summarize
            intent: Summarization intent (normalized through the intent classifier)
            summary_level: Level preset used for post-processing
            quality_preference: Quality preset used for generation parameters
            max_length: Maximum generated length passed to the model
            min_length: Minimum generated length passed to the model
            num_beams: Beam count passed to the model
            language: Optional language override for this call
            use_rag: Index the pre-filtered text and summarize retrieved chunks

        Returns:
            The formatted (and, for non-English targets, translated) summary.
            An empty string is returned for an empty or whitespace-only document.
        """
        # Nothing to summarize: skip indexing/generation entirely.
        if not document or not document.strip():
            logger.warning("[Summarize] empty document; returning empty summary")
            return ""

        # NOTE(review): this override is written to the loader/tokenizer and is
        # not restored afterwards, so it persists for later calls - confirm intended.
        if language and language != self.language:
            self.model_loader.language = language
            self.model_loader.language_code = self.model_loader.SUPPORTED_LANGUAGES.get(
                language.lower(), 'en_XX'
            )
            if hasattr(self.tokenizer, 'src_lang'):
                self.tokenizer.src_lang = self.model_loader.language_code

        # Validate intent string
        if isinstance(intent, str):
            intent = self.intent_classifier.classify_intent(intent)

        # ── Step 1: Intent-aware pre-filtering ──────────────────────────────
        intent_text = extract_intent_relevant_text(document, intent, max_chars=3000)
        logger.info(
            "[Summarize] intent=%s level=%s quality=%s",
            intent, summary_level, quality_preference,
        )
        logger.info("[Summarize] pre-filter: %s chars selected", len(intent_text))

        # ── Step 2: Optional RAG ────────────────────────────────────────────
        if use_rag:
            indexing_stats = self.rag_pipeline.index_document(intent_text)
            logger.info("RAG indexed: %s", indexing_stats)
            intent_prompt = self.intent_classifier.get_prompt_for_intent(intent)
            retrieved_chunks = self.rag_pipeline.retrieve_context(intent_prompt, k=3)
            summary_text = self.rag_pipeline.merge_context(
                [chunk for chunk, _ in retrieved_chunks],
                [score for _, score in retrieved_chunks]
            )
        else:
            summary_text = intent_text

        # ── Step 3: Generate ────────────────────────────────────────────────
        quality_cfg = get_quality_config(quality_preference)
        raw_summary = self._generate_summary(
            summary_text, intent, max_length, min_length, num_beams, quality_cfg,
        )

        # ── Step 4: Post-process (intent format + level formatting) ─────────
        formatted = postprocess_summary(raw_summary, intent, summary_level)

        # ── Step 5: Translate if non-English ────────────────────────────────
        target_lang = language or self.language
        if target_lang and target_lang.lower() not in ('english', 'en'):
            formatted = translate_summary(formatted, target_lang)

        return formatted

    def _prepare_for_summarization(
        self,
        abstract: str,
        text: str,
        max_length: int
    ) -> str:
        """
        Prepare text for summarization, handling long documents.

        Texts longer than 4000 characters are chunked and only the first two
        chunks are kept; shorter texts are used whole. The abstract, when
        present, is placed first and separated from the body by a space.

        Args:
            abstract: Document abstract if available
            text: Main document text
            max_length: Target max length (currently not used by this method)

        Returns:
            Prepared text for summarization
        """
        if len(text) > 4000:
            chunks = chunk_text(text, chunk_size=1000, overlap=100)
            body = " ".join(chunks[:2])
        else:
            body = text

        # Always separate abstract and body; strip() removes the leading
        # space when there is no abstract.
        return f"{abstract or ''} {body}".strip()

    def _generate_summary(
        self,
        text: str,
        intent: str,
        max_length: int,
        min_length: int,
        num_beams: int,
        quality_cfg: Optional[Dict[str, Any]] = None,
    ) -> str:
        """
        Generate summary using the intent-aware T5 prefix.

        Args:
            text: Text to summarize
            intent: Intent used by ``build_t5_input`` to build the model input
            max_length: Maximum generated length
            min_length: Minimum generated length
            num_beams: Number of beams for beam search
            quality_cfg: Supplies 'no_repeat_ngram_size' (default 3) and
                'length_penalty' (default 1.2); the 'balanced' preset is
                loaded when omitted.

        Returns:
            Decoded summary text with special tokens removed
        """
        if quality_cfg is None:
            quality_cfg = get_quality_config('balanced')

        input_text = build_t5_input(text, intent)

        # Input is truncated/padded to a fixed 512 tokens.
        inputs = self.tokenizer(
            input_text,
            return_tensors='pt',
            max_length=512,
            truncation=True,
            padding='max_length'
        ).to(self.device)

        with torch.no_grad():
            summary_ids = self.model.generate(
                inputs['input_ids'],
                attention_mask=inputs.get('attention_mask'),
                max_length=max_length,
                min_length=min_length,
                num_beams=num_beams,
                early_stopping=True,
                do_sample=False,
                no_repeat_ngram_size=quality_cfg.get('no_repeat_ngram_size', 3),
                length_penalty=quality_cfg.get('length_penalty', 1.2),
            )

        summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        logger.info(
            "[Generate] intent=%s | words=%s | beams=%s",
            intent, len(summary.split()), num_beams,
        )
        return summary

    @staticmethod
    def _match_case(original: str, replacement: str) -> str:
        """Return *replacement* with the capitalisation style of *original*."""
        if original.isupper():
            return replacement.upper()
        if original[:1].isupper():
            return replacement[:1].upper() + replacement[1:]
        return replacement

    def _simplify_language(self, text: str) -> str:
        """
        Simplify language to make summary easy to understand.

        Replaces a fixed list of formal words with plainer ones (keeping the
        capitalisation of the word being replaced), then breaks sentences
        longer than 120 characters at ' and '.

        Args:
            text: Text to simplify

        Returns:
            Simplified text
        """
        simplified = text
        for complex_word, simple_word in self._SIMPLIFICATIONS.items():
            simplified = re.sub(
                r'\b' + re.escape(complex_word) + r'\b',
                # Bind simple_word per iteration; keep e.g. a sentence-initial capital.
                lambda match, repl=simple_word: self._match_case(match.group(0), repl),
                simplified,
                flags=re.IGNORECASE
            )

        sentences = re.split(r'(?<=[.!?])\s+', simplified)
        simplified_sentences = []

        for sentence in sentences:
            if len(sentence) > 120:
                # Split very long sentences.
                # NOTE: the ' and ' separator itself is dropped when the parts
                # are re-joined with spaces below.
                parts = sentence.split(' and ')
                if len(parts) > 1:
                    simplified_sentences.extend(parts)
                else:
                    simplified_sentences.append(sentence)
            else:
                simplified_sentences.append(sentence)

        return ' '.join(simplified_sentences)

    def summarize_batch(
        self,
        documents: List[str],
        intent: str = 'technical_overview',
        language: Optional[str] = None,
        return_keywords: bool = False
    ) -> List[Union[str, Dict[str, Any]]]:
        """
        Summarize multiple documents one after another.

        A failure on one document does not abort the batch.

        Args:
            documents: List of documents to summarize
            intent: Summarization intent
            language: Language for summarization
            return_keywords: Accepted for API compatibility; currently not
                used by this method

        Returns:
            One entry per input document, in order: the summary string on
            success, or ``{'error': <message>}`` if summarization raised.
        """
        total = len(documents)
        logger.info("Batch summarizing %d documents...", total)

        results = []
        for i, doc in enumerate(documents, 1):
            try:
                result = self.summarize(
                    doc,
                    intent=intent,
                    language=language
                )
            except Exception as e:
                # Best-effort batch boundary: record the failure and continue.
                logger.exception("Error processing document %d: %s", i, e)
                results.append({'error': str(e)})
            else:
                results.append(result)
                logger.info("Processed %d/%d", i, total)

        logger.info("Batch summarization complete")
        return results

    def _format_as_bullets(self, summary: str) -> str:
        """
        Format summary as bullet points.

        Sentences are split on a period followed by whitespace or the end of
        the text, so periods inside tokens such as "2.0" do not start a new
        bullet. Empty fragments are dropped.

        Args:
            summary: Summary text

        Returns:
            One "• "-prefixed line per sentence, joined by newlines
        """
        sentences = re.split(r'\.(?:\s+|$)', summary)
        bullets = [f"• {s.strip()}" for s in sentences if s.strip()]
        return '\n'.join(bullets)

    def summarize_with_sections(
        self,
        document: str,
        max_length_per_section: int = 100
    ) -> Dict[str, str]:
        """
        Summarize document with separate summaries for each section.

        Sections whose content is empty or whitespace-only are skipped.

        Args:
            document: Document text
            max_length_per_section: Max length for each section summary

        Returns:
            Dictionary mapping section title to its summary
        """
        sections = self.parser.extract_sections(document)

        summaries = {}
        for section_title, section_content in sections:
            if section_content.strip():
                summary = self.summarize(
                    section_content,
                    max_length=max_length_per_section
                )
                summaries[section_title] = summary

        return summaries

    def get_model_info(self) -> Dict[str, Any]:
        """Get information about the loaded model (delegates to the model loader)."""
        return self.model_loader.get_model_info()