Spaces:
Runtime error
Runtime error
| from transformers import PegasusForConditionalGeneration, PegasusTokenizer, pipeline | |
| from transformers import AutoModelForSeq2SeqLM, AutoTokenizer as NLLBTokenizer | |
| from transformers import M2M100ForConditionalGeneration, M2M100Config | |
| # from langdetect import detect, LangDetectException | |
| import torch | |
| import os | |
# Location of the fastText language-identification weights, resolved relative
# to the current working directory.
_FASTTEXT_MODEL_PATH = "lid.176.bin"
# An English prediction below this confidence is flagged so that callers can
# still push the text through the translator.
_ENGLISH_CONFIDENCE_THRESHOLD = 0.95
# Module-wide fastText handle; remains None whenever loading is not possible.
_ft_model = None

try:
    import fasttext

    if not os.path.exists(_FASTTEXT_MODEL_PATH):
        print(f"Warning: fastText model file not found at {_FASTTEXT_MODEL_PATH}.")
    else:
        print(f"Loading fastText language identification model from: {_FASTTEXT_MODEL_PATH}")
        _ft_model = fasttext.load_model(_FASTTEXT_MODEL_PATH)
        print("fastText model loaded successfully.")
except ImportError:
    # The dependency is optional at import time; detection is simply disabled.
    print("Warning: fastText library not installed.")
    _ft_model = None
except Exception as load_error:
    # Any other failure (corrupt file, incompatible binary, ...) also disables
    # detection instead of aborting the import of this module.
    print(f"Error loading fastText model: {load_error}")
    _ft_model = None
| def get_language_code(text: str) -> tuple[str | None, str | None, float | None, bool]: | |
| """ | |
| Detects language using fastText and returns a tuple: | |
| (detected_code_raw, nllb_compatible_code, confidence, force_translation_to_english_flag) | |
| The last flag is True if detected as English but confidence is below threshold. | |
| """ | |
| global _ft_model | |
| detected_code_raw = None | |
| confidence = 0.0 | |
| force_translation = False | |
| if not _ft_model: | |
| print("Error: fastText model not available for language detection.") | |
| return None, None, None, False | |
| try: | |
| cleaned_text = text.replace("\n", " ").replace("\r", " ") | |
| if not cleaned_text.strip(): | |
| print("Warning: Input text is empty or whitespace only for language detection.") | |
| return None, None, None, False | |
| predictions = _ft_model.predict(cleaned_text, k=1) | |
| if predictions and predictions[0]: | |
| detected_code_raw = predictions[0][0].replace('__label__', '') | |
| confidence = predictions[1][0] | |
| print(f"fastText detected: {detected_code_raw} with confidence {confidence:.4f}") | |
| # Check for low confidence English | |
| if (detected_code_raw == 'en' or detected_code_raw == 'eng') and confidence < _ENGLISH_CONFIDENCE_THRESHOLD: | |
| print(f"English detected with low confidence ({confidence:.4f} < {_ENGLISH_CONFIDENCE_THRESHOLD}). Flagging for potential translation.") | |
| force_translation = True | |
| else: | |
| print("fastText could not predict language.") | |
| return None, None, None, False | |
| except Exception as e: | |
| print(f"Error during fastText language detection: {e}") | |
| return None, None, None, False | |
| if not detected_code_raw: | |
| return None, None, None, False | |
| # --- NLLB Language Code Mapping --- | |
| # NLLB uses Flores-200 codes | |
| nllb_lang_map = { | |
| 'en': 'eng_Latn', 'eng': 'eng_Latn', | |
| 'es': 'spa_Latn', 'spa': 'spa_Latn', | |
| 'fr': 'fra_Latn', | |
| 'de': 'deu_Latn', 'ger': 'deu_Latn', | |
| 'it': 'ita_Latn', | |
| 'pt': 'por_Latn', | |
| 'zh': 'zho_Hans', | |
| 'zh-cn': 'zho_Hans', | |
| 'zh-tw': 'zho_Hant', | |
| 'ja': 'jpn_Jpan', 'jpn': 'jpn_Jpan', | |
| 'ko': 'kor_Hang', 'kor': 'kor_Hang', | |
| 'ar': 'ara_Arab', | |
| 'hi': 'hin_Deva', | |
| 'ru': 'rus_Cyrl', | |
| 'bn': 'ben_Beng', | |
| 'pa': 'pan_Guru', | |
| 'ur': 'urd_Arab', | |
| 'ta': 'tam_Taml', | |
| 'te': 'tel_Telu', | |
| 'ml': 'mal_Mlym', | |
| 'gu': 'guj_Gujr', | |
| 'mr': 'mar_Deva', | |
| 'id': 'ind_Latn', 'ind': 'ind_Latn', | |
| 'vi': 'vie_Latn', 'vie': 'vie_Latn', | |
| 'th': 'tha_Thai', | |
| 'tr': 'tur_Latn', | |
| 'fa': 'pes_Arab', 'per': 'pes_Arab', | |
| 'uk': 'ukr_Cyrl', | |
| 'pl': 'pol_Latn', | |
| 'nl': 'nld_Latn', 'dut': 'nld_Latn', | |
| 'ro': 'ron_Latn', 'rum': 'ron_Latn', | |
| 'cs': 'ces_Latn', 'cze': 'ces_Latn', | |
| 'sv': 'swe_Latn', | |
| 'fi': 'fin_Latn', | |
| 'da': 'dan_Latn', | |
| 'no': 'nob_Latn', | |
| 'el': 'ell_Grek', 'gre': 'ell_Grek', | |
| 'he': 'heb_Hebr', | |
| 'hu': 'hun_Latn', | |
| 'bg': 'bul_Cyrl', | |
| 'sr': 'srp_Cyrl', | |
| 'hr': 'hrv_Latn', | |
| 'sk': 'slk_Latn', | |
| 'sl': 'slv_Latn', | |
| 'et': 'est_Latn', | |
| 'lv': 'lav_Latn', | |
| 'lt': 'lit_Latn', | |
| 'sw': 'swh_Latn', | |
| 'am': 'amh_Ethi', | |
| 'yo': 'yor_Latn', | |
| 'ig': 'ibo_Latn', | |
| 'zu': 'zul_Latn', | |
| 'xh': 'xho_Latn', | |
| 'my': 'mya_Mymr', | |
| 'km': 'khm_Khmr', | |
| 'lo': 'lao_Laoo', | |
| 'ne': 'npi_Deva', | |
| 'si': 'sin_Sinh', | |
| 'az': 'azj_Latn', | |
| 'kk': 'kaz_Cyrl', | |
| 'uz': 'uzn_Latn', | |
| 'mn': 'khk_Cyrl', | |
| 'ps': 'pbt_Arab', | |
| 'tg': 'tgk_Cyrl', | |
| 'tk': 'tuk_Latn', | |
| 'so': 'som_Latn', | |
| } | |
| nllb_code = nllb_lang_map.get(detected_code_raw) | |
| if not nllb_code and '-' in detected_code_raw: | |
| nllb_code = nllb_lang_map.get(detected_code_raw.split('-')[0]) | |
| if not nllb_code: | |
| print(f"Warning: No NLLB mapping for detected language code '{detected_code_raw}'.") | |
| return detected_code_raw, nllb_code, confidence, force_translation | |
| class Summarizer: | |
| def __init__(self, | |
| summarizer_model_name="google/pegasus-cnn_dailymail", | |
| translator_model_name="facebook/nllb-200-distilled-1.3B"): # NLLB model | |
| self.summarizer_model_name = summarizer_model_name | |
| self.translator_model_name = translator_model_name | |
| self.device = "cuda" if torch.cuda.is_available() else "cpu" | |
| print(f"Using device: {self.device}") | |
| self.pegasus_tokenizer = None | |
| self.pegasus_model = None | |
| self.translator_tokenizer = None | |
| self.translator_model = None | |
| if "pegasus" in self.summarizer_model_name.lower(): | |
| self.effective_input_token_limit = 512 | |
| # elif "bart" in self.summarizer_model_name.lower(): # mbart is a bart model | |
| # self.effective_input_token_limit = 1024 | |
| else: # Default | |
| self.effective_input_token_limit = 512 | |
| # Add effective input token limit for the translator model | |
| self.translator_effective_input_token_limit = 500 | |
| self._load_models() | |
| def _load_models(self): | |
| try: | |
| print(f"Loading Pegasus tokenizer: {self.summarizer_model_name}...") | |
| self.pegasus_tokenizer = PegasusTokenizer.from_pretrained(self.summarizer_model_name) | |
| print(f"{self.summarizer_model_name} tokenizer loaded.") | |
| print(f"Loading Pegasus model: {self.summarizer_model_name}...") | |
| self.pegasus_model = PegasusForConditionalGeneration.from_pretrained(self.summarizer_model_name).to(self.device) | |
| print(f"{self.summarizer_model_name} model loaded.") | |
| print(f"Loading NLLB tokenizer: {self.translator_model_name}...") | |
| self.translator_tokenizer = NLLBTokenizer.from_pretrained(self.translator_model_name) | |
| print(f"NLLB tokenizer type: {type(self.translator_tokenizer)}") | |
| print("NLLB tokenizer loaded.") | |
| print(f"Loading NLLB model config using M2M100Config: {self.translator_model_name}...") | |
| translator_config = M2M100Config.from_pretrained(self.translator_model_name) | |
| print(f"Explicit NLLB/M2M100 config type: {type(translator_config)}") | |
| print(f"Loading NLLB model using M2M100ForConditionalGeneration: {self.translator_model_name}...") | |
| self.translator_model = M2M100ForConditionalGeneration.from_pretrained( | |
| self.translator_model_name, | |
| config=translator_config | |
| ).to(self.device) | |
| print(f"NLLB model type: {type(self.translator_model)}") | |
| print("NLLB model loaded.") | |
| # --- DEBUGGING --- | |
| # if self.translator_model and hasattr(self.translator_model, 'config'): | |
| # print(f"NLLB model config type: {type(self.translator_model.config)}") # Should be M2M100Config | |
| # print(f"NLLB model config keys: {list(self.translator_model.config.to_dict().keys())}") | |
| # if hasattr(self.translator_model.config, 'lang_code_to_id'): | |
| # print("SUCCESS: 'lang_code_to_id' FOUND in translator_model.config") | |
| # else: | |
| # print("FAILURE: 'lang_code_to_id' NOT FOUND in translator_model.config") | |
| # else: | |
| # print("NLLB model or its config is None after loading attempts.") | |
| # --- END DEBUGGING --- | |
| except Exception as e: | |
| print(f"Error loading models: {e}") | |
| import traceback | |
| traceback.print_exc() | |
| raise | |
| def _translate_text(self, text_to_translate: str, src_nllb_lang: str, tgt_nllb_lang: str) -> str | None: | |
| if not self.translator_model or not self.translator_tokenizer: | |
| print("Translator model/tokenizer not loaded.") | |
| return None | |
| if not src_nllb_lang: | |
| print(f"Missing NLLB source language code for translation. Cannot translate.") | |
| return text_to_translate # Keep original behavior | |
| if not text_to_translate.strip(): | |
| return "" # Handle empty string input explicitly | |
| token_limit = self.translator_effective_input_token_limit | |
| try: | |
| # Set the source language for the NLLB tokenizer | |
| self.translator_tokenizer.src_lang = src_nllb_lang | |
| all_input_ids = self.translator_tokenizer.encode(text_to_translate, add_special_tokens=False) | |
| total_tokens = len(all_input_ids) | |
| if total_tokens == 0: | |
| print(f"Warning: Text resulted in 0 tokens after encoding. Original text (first 100 chars): '{text_to_translate[:100]}...'") | |
| return "" | |
| translated_parts = [] | |
| if total_tokens <= token_limit: | |
| print(f"DEBUG: Translating text as a single chunk ({total_tokens} tokens).") | |
| inputs = self.translator_tokenizer( | |
| text_to_translate, | |
| return_tensors="pt", | |
| truncation=True, | |
| padding=True, | |
| max_length=token_limit | |
| ).to(self.device) | |
| try: | |
| target_lang_token_id = self.translator_tokenizer.convert_tokens_to_ids(tgt_nllb_lang) | |
| except Exception as e_conv: | |
| print(f"Error converting target language code '{tgt_nllb_lang}' to ID: {e_conv}") | |
| return None | |
| if target_lang_token_id == self.translator_tokenizer.unk_token_id: | |
| print(f"Warning: Target language code '{tgt_nllb_lang}' was converted to UNK token ID.") | |
| return None | |
| generated_tokens = self.translator_model.generate( | |
| **inputs, | |
| forced_bos_token_id=target_lang_token_id, | |
| max_length=1024, | |
| repetition_penalty=1.2, | |
| no_repeat_ngram_size=3 | |
| ) | |
| translated_text_segment = self.translator_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0] | |
| translated_parts.append(translated_text_segment) | |
| else: | |
| print(f"Input text for translation has {total_tokens} tokens. Applying chunking (token limit for tokenizer: {token_limit}).") | |
| # Pattern similar to summarizer chunking: | |
| # chunk_size for slicing from all_input_ids | |
| chunk_slice_size = token_limit - 50 | |
| if chunk_slice_size <= 0: # Ensure positive slice size | |
| chunk_slice_size = token_limit // 2 if token_limit > 1 else 1 | |
| overlap_tokens = 50 | |
| if overlap_tokens >= chunk_slice_size and chunk_slice_size > 0: | |
| overlap_tokens = chunk_slice_size // 3 | |
| elif chunk_slice_size == 0: | |
| overlap_tokens = 0 | |
| start_idx = 0 | |
| chunk_num = 1 | |
| while start_idx < total_tokens: | |
| end_idx = min(start_idx + chunk_slice_size, total_tokens) | |
| chunk_token_ids_for_decode = all_input_ids[start_idx:end_idx] | |
| current_chunk_text_to_translate = self.translator_tokenizer.decode( | |
| chunk_token_ids_for_decode, | |
| skip_special_tokens=True, | |
| clean_up_tokenization_spaces=True | |
| ) | |
| if not current_chunk_text_to_translate.strip(): | |
| print(f"DEBUG: Skipping empty decoded chunk {chunk_num}.") | |
| # Advance start_idx | |
| if end_idx == total_tokens: break | |
| step = chunk_slice_size - overlap_tokens | |
| start_idx += step if step > 0 else 1 | |
| if start_idx >= end_idx and end_idx < total_tokens : start_idx = end_idx # Ensure progress | |
| chunk_num += 1 | |
| continue | |
| print(f"DEBUG: Translating chunk {chunk_num}. Input token slice: {start_idx}-{end_idx-1}. Decoded text length: {len(current_chunk_text_to_translate)}") | |
| inputs = self.translator_tokenizer( | |
| current_chunk_text_to_translate, | |
| return_tensors="pt", | |
| truncation=True, | |
| padding=True, | |
| max_length=token_limit # Tokenizer will cap the input from this text chunk | |
| ).to(self.device) | |
| try: | |
| target_lang_token_id = self.translator_tokenizer.convert_tokens_to_ids(tgt_nllb_lang) | |
| except Exception as e_conv: | |
| print(f"Error converting target language code '{tgt_nllb_lang}' to ID for chunk {chunk_num}: {e_conv}") | |
| return None | |
| if target_lang_token_id == self.translator_tokenizer.unk_token_id: | |
| print(f"Warning: Target language code '{tgt_nllb_lang}' was UNK for chunk {chunk_num}.") | |
| return None | |
| generated_tokens = self.translator_model.generate( | |
| **inputs, | |
| forced_bos_token_id=target_lang_token_id, | |
| max_length=1024, | |
| repetition_penalty=1.2, | |
| no_repeat_ngram_size=3 | |
| ) | |
| translated_segment = self.translator_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0] | |
| translated_parts.append(translated_segment) | |
| chunk_num +=1 | |
| if end_idx == total_tokens: | |
| break | |
| step = chunk_slice_size - overlap_tokens | |
| start_idx += step if step > 0 else 1 # Ensure progress even if step is not positive | |
| # Safety break if start_idx doesn't advance properly (should not be needed with step logic) | |
| if start_idx >= total_tokens and end_idx < total_tokens: break | |
| if start_idx >= end_idx and end_idx < total_tokens : start_idx = end_idx | |
| final_translated_text = " ".join(translated_parts).strip() | |
| return final_translated_text | |
| except Exception as e: | |
| print(f"Error during translation from {src_nllb_lang} to {tgt_nllb_lang}: {e}") | |
| import traceback | |
| traceback.print_exc() | |
| return None | |
| def _summarize_english_text(self, text_chunk: str, min_length: int, max_length: int) -> str: | |
| # Refined from your version for clarity and robustness | |
| if not self.pegasus_model or not self.pegasus_tokenizer: | |
| print("Error: Pegasus model/tokenizer not loaded.") | |
| return "Error: Summarization model not available." | |
| try: | |
| # Provide default values if None | |
| effective_min_length = min_length if min_length is not None else 30 | |
| effective_max_length = max_length if max_length is not None else 128 | |
| inputs = self.pegasus_tokenizer( | |
| text_chunk, return_tensors="pt", truncation=True, max_length=self.effective_input_token_limit | |
| ).to(self.device) | |
| summary_ids = self.pegasus_model.generate( | |
| inputs["input_ids"], | |
| num_beams=4, | |
| min_length=effective_min_length, | |
| max_length=effective_max_length, | |
| early_stopping=True | |
| ) | |
| summary_text_raw = self.pegasus_tokenizer.decode( | |
| summary_ids[0], | |
| skip_special_tokens=True, # Important | |
| clean_up_tokenization_spaces=True | |
| ) | |
| # Post-processing for <n> tokens | |
| processed_summary = summary_text_raw.replace("<n>", "\n") | |
| lines = [line.strip() for line in processed_summary.split('\n')] | |
| processed_summary = "\n".join(filter(None, lines)) # Removes empty lines | |
| return processed_summary.strip() | |
| except Exception as e: | |
| print(f"Error during Pegasus summarization of chunk: {e}") | |
| import traceback | |
| traceback.print_exc() | |
| return f"Error summarizing chunk: {e}" | |
| def summarize(self, text: str, min_length_per_chunk: int = None, max_length_per_chunk: int = None, | |
| overall_min_length: int = None, overall_max_length: int = None) -> dict: | |
| result = { | |
| 'final_summary': None, 'detected_language_raw': None, 'detected_language_confidence': None, | |
| 'english_translation': None, 'english_summary': None, 'error': None, | |
| 'translation_performed': False # New key to indicate if translation to English happened | |
| } | |
| if not all([self.pegasus_model, self.pegasus_tokenizer, self.translator_model, self.translator_tokenizer]): | |
| result['error'] = "Error: Core models not loaded." | |
| return result | |
| if _ft_model is None: | |
| result['error'] = "Error: Language detection model (fastText) not available." | |
| return result | |
| if not text.strip(): | |
| result['error'] = "Error: Input text is empty." | |
| return result | |
| original_text_to_process = text | |
| detected_lang_raw, detected_lang_nllb, confidence, force_translation_flag = get_language_code(text) | |
| result['detected_language_raw'] = detected_lang_raw | |
| result['detected_language_confidence'] = confidence | |
| if not detected_lang_raw: | |
| result['error'] = "Error: Could not detect language." | |
| return result | |
| # Determine if translation to English is needed | |
| needs_translation_to_english = False | |
| if (detected_lang_raw != 'en' and detected_lang_raw != 'eng'): | |
| needs_translation_to_english = True | |
| elif force_translation_flag: | |
| print(f"INFO: Detected as English ('{detected_lang_raw}') but confidence {confidence} is low. Attempting 'translation' to English to normalize.") | |
| needs_translation_to_english = True | |
| if not detected_lang_nllb: | |
| detected_lang_nllb = 'eng_Latn' | |
| result['translation_performed'] = True | |
| if needs_translation_to_english: | |
| if detected_lang_nllb: | |
| print(f"Original language: {detected_lang_raw} (NLLB src: {detected_lang_nllb}). Translating to English (eng_Latn)...") | |
| english_text_translation = self._translate_text(original_text_to_process, detected_lang_nllb, "eng_Latn") | |
| result['translation_performed'] = True | |
| if not english_text_translation or english_text_translation.startswith("Error"): | |
| err_msg = f"Error: Translation to English failed for lang {detected_lang_raw}." | |
| result['error'] = err_msg | |
| return result | |
| original_text_to_process = english_text_translation | |
| result['english_translation'] = original_text_to_process | |
| print("Translation to English complete.") | |
| else: # Non-English detected by fastText, but no NLLB mapping for it | |
| print(f"Critical: Original language detected as '{detected_lang_raw}', but no NLLB code mapping found. Cannot reliably translate.") | |
| result['error'] = f"Error: Input language '{detected_lang_raw}' is not supported for translation (no NLLB mapping)." | |
| return result | |
| # --- Summarization of English text (original_text_to_process is now English) --- | |
| all_input_ids = self.pegasus_tokenizer.encode(original_text_to_process, add_special_tokens=False) | |
| total_tokens = len(all_input_ids) | |
| english_summary_text = None | |
| if total_tokens <= self.effective_input_token_limit: | |
| english_summary_text = self._summarize_english_text(original_text_to_process, overall_min_length, overall_max_length) | |
| else: | |
| # (Your full chunking logic here) | |
| print(f"Input text for summarizer has {total_tokens} tokens. Applying chunking...") | |
| chunk_size = self.effective_input_token_limit - 50 | |
| overlap_size = 50 | |
| chunks_texts = [] | |
| start_idx = 0 | |
| while start_idx < total_tokens: | |
| end_idx = min(start_idx + chunk_size, total_tokens) | |
| chunk_token_ids = all_input_ids[start_idx:end_idx] | |
| chunk_text_for_summary = self.pegasus_tokenizer.decode(chunk_token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) | |
| chunks_texts.append(chunk_text_for_summary) | |
| if end_idx == total_tokens: break | |
| start_idx += (chunk_size - overlap_size) | |
| if start_idx >= end_idx : break | |
| if not chunks_texts: | |
| result['error'] = "Error: Failed to create any chunks from the (potentially translated) text." | |
| return result | |
| chunk_summaries_list = [] | |
| for i_chunk, chunk_text_to_summarize in enumerate(chunks_texts): | |
| print(f"Summarizing (English) chunk {i_chunk+1}/{len(chunks_texts)}...") | |
| chunk_sum = self._summarize_english_text(chunk_text_to_summarize, min_length_per_chunk, max_length_per_chunk) | |
| if chunk_sum.startswith("Error"): | |
| result['error'] = f"Error summarizing chunk {i_chunk+1}: {chunk_sum.split(':', 1)[1].strip() if ':' in chunk_sum else chunk_sum}" | |
| return result | |
| chunk_summaries_list.append(chunk_sum) | |
| english_summary_text = " ".join(chunk_summaries_list) | |
| if not english_summary_text or english_summary_text.startswith("Error"): | |
| err_msg_sum = f"Error during English summarization" | |
| if english_summary_text and english_summary_text.startswith("Error"): | |
| err_msg_sum += f": {english_summary_text.split(':', 1)[1].strip() if ':' in english_summary_text else english_summary_text}" | |
| elif not english_summary_text: | |
| err_msg_sum += ": Summarization returned no text." | |
| result['error'] = err_msg_sum | |
| return result | |
| result['english_summary'] = english_summary_text | |
| # --- Translate summary back --- | |
| final_summary_text = english_summary_text | |
| if result['translation_performed'] and not force_translation_flag: # if force_translation_flag, it was already "English" | |
| if detected_lang_raw != 'en' and detected_lang_raw != 'eng' and detected_lang_nllb: | |
| print(f"Translating summary back to {detected_lang_raw} (NLLB: {detected_lang_nllb})...") | |
| translated_summary_back = self._translate_text(english_summary_text, "eng_Latn", detected_lang_nllb) | |
| if translated_summary_back and not translated_summary_back.startswith("Error"): | |
| final_summary_text = translated_summary_back | |
| print("Back-translation complete.") | |
| else: | |
| warning_msg = f"Warning: Back-translation to {detected_lang_raw} failed or returned error. Returning English summary." | |
| if translated_summary_back and translated_summary_back.startswith("Error"): | |
| warning_msg += f" Details: {translated_summary_back.split(':', 1)[1].strip() if ':' in translated_summary_back else translated_summary_back}" | |
| elif not translated_summary_back: | |
| warning_msg += " Back-translation function returned None." | |
| print(warning_msg) | |
| result['final_summary'] = final_summary_text | |
| return result | |
# Manual smoke test: runs the engine on a few sample articles when this file
# is executed directly. It needs the fastText weights and downloads both
# Hugging Face models on first use.
if __name__ == "__main__":
    print("Testing Summarizer Engine...")
    if _ft_model is not None:
        try:
            engine = Summarizer()

            # (heading, tag, article, overall_min_length, overall_max_length)
            short_cases = (
                (
                    "English Article", "EN",
                    "Apple today announced its new iPhone, which features a faster processor and an improved camera. The company expects strong sales.",
                    10, 50,
                ),
                ("Chinese Article (Short)", "ZH", "今天天气真好,阳光明媚。", 5, 30),
                ("Mixed Language Article", "Mixed", '今天天气真好 How are you? 大家都在玩', 5, 40),
            )
            for heading, tag, article, min_len, max_len in short_cases:
                print(f"\n--- Testing {heading} ---")
                print(f"Input ({tag}):\n{article}")
                outcome = engine.summarize(article, overall_min_length=min_len, overall_max_length=max_len)
                print(f"Result ({tag}): {outcome}")

            # The long article exercises the chunked summarization path.
            print("\n--- Testing Long English Article ---")
            long_article_en = """
Warmer weather is finally here in the Northern Hemisphere, and with it, many pools and beaches are opening for the summer. That’s great for families who want to spend time by the water, but it’s also a good time to be reminded about the importance of water safety.
An estimated 4,000 fatal unintentional drownings happen every year in the United States — an average of 11 drowning deaths per day — according to the Centers for Disease Control and Prevention.
Drowning is the leading cause of death in children ages 1 to 4, and after motor vehicle accidents, it is the second leading cause of death attributed to unintentional injuries among kids ages 5 to 14.
I wanted to speak more about water safety with CNN wellness expert Dr. Leana Wen. Wen is an emergency physician and adjunct associate professor at George Washington University who previously served as Baltimore’s health commissioner. She is also the mother of two young children, and as someone who learned to swim as an adult, she is a passionate advocate for kids — and adults — learning to swim.
CNN: Who is most at risk of drowning, and under what circumstances?
Dr. Leana Wen: The CDC issued an important report in 2024 about unintentional drowning deaths in the United States. Death rates were significantly higher in 2020, 2021 and 2022 than in 2019, according to the agency. Moreover, the highest rates were seen in young children ages 1 to 4. Among children in this age group, 461 died by drowning in 2022, an increase of 28% from 2019.
The report also highlighted racial disparities, with higher rates of drowning deaths among individuals identified as non-Hispanic American Indian or Alaska Native and as non-Hispanic Black or African American. Only 45% of all adults reported having had swim lessons, and those numbers were higher among White Americans (52%) than among Black (37%) and Hispanic (28%) Americans.
Racial disparities were also reported in a 2023 analysis from the Consumer Product Safety Commission, which found that African American children made up 21% of all drownings for kids younger than age 15 in which race and ethnicity are known. Among kids ages 5 to 14, 45% of drowning deaths occurred among African Americans.
The CPSC analysis also contained another key data point: The vast majority (80%) of pediatric drownings in which the setting is known occurred at a residence. This means that 4 in 5 kids who drowned died in their own backyard pool or that of a friend, neighbor or family member. Of these residential drownings, 91% occurred among kids younger than 7 years old.
CNN: Why do so many drownings happen in residential settings?
Wen: One major reason is the difference in supervision. Many public beaches and community pools hire lifeguards whose job is to watch out for the safety of everyone in or near the water. Private pools in people’s yards often don’t have someone designated for this purpose. Sometimes older children are supervising younger children but aren’t always watching. Or adults may be supervising, but they are also busy with other tasks. In addition, some of those watching others may not know how to swim themselves.
There may also be a false sense of security in residential settings. People may think the pool is small or not that deep or there are a lot of people around, so what can happen? Keep in mind, though, that small kids can drown in just inches of water. Serious injury or death can happen within 30 seconds. Drownings are often silent because the victim is unable to call out for help.
CNN: How can parents and guardians prevent drownings in residential settings?
Wen: The single most important best practice is to never leave children unsupervised near a body of water. Even if they already know how to swim, even if they are wearing a flotation device, even if the pool is shallow or small, an accident could occur — and either you or another responsible adult should always be able to see your child. The supervising adult should be actively watching the child and not distracted by chores or their smartphone. That person also should not be under the influence of alcohol or drugs.
The adult who’s responsible must also know how to swim well enough so they are able to jump into the pool and save the child if necessary. An additional safety precaution is learning CPR and first aid for infants, children and adults, which you can do through the American Red Cross.
More than 1 in 3 Black adults say they can’t swim. This team is trying to teach them and their kids
If you have a pool, be very careful before allowing others to use it. If your neighbors’ children want to swim in your pool, a responsible adult must accompany them. Private swimming pools should all have childproof fencing around them. The fencing should enclose the pool, have a self-close latch out of the reach of children and be at least 4 feet high. This is required by law in most states.
CNN: What safety precautions should people take around natural bodies of water?
Wen: Always wear a properly fitted, US Coast Guard–approved life jacket when boating. Of all the people who drowned while boating in 2022, 85% were not wearing a life jacket, according to the CDC.
To be safe, swim in areas where a lifeguard is on duty. Always follow lifeguard guidance about safety conditions, and stay in the area designated for swimming.
CNN: What about teaching children how to swim — can that help with water safety?
Wen: Yes. Kids ages 1 to 4 who took part in formal swim lessons had an 88% lower risk of drowning, according to a study in JAMA Pediatrics. The goal here isn’t necessarily to teach kids all the different strokes and get them to join a swim team; it’s to impart basic lifesaving skills, such as treading water and floating on their back.
When you are in the water with your children, take every opportunity to remind them about water safety. Other tips include never swimming alone, always asking for permission before entering the water and never diving into unknown bodies of water headfirst. Young children should also be reminded not to reach for items in the pool, as they are at risk of falling in; they should always ask for help instead.
Never leave children alone by the water, and remind them to ask for help if they want to reach something in the pool. travelism/E+/Getty Images
CNN: What about parents or guardians who don’t know how to swim? Do you recommend that they also take swim lessons?
Wen: Yes. First, adults who don’t know how to swim are more likely to have children who don’t know how to swim. This was the case for me. My parents didn’t swim, and I also never learned swimming growing up.
Second, it’s hard for adults to properly supervise children swimming if they can’t swim themselves. It was actually a terrifying experience with my own children that prompted me to learn to swim. My children were just 1 year and 3 years old one summer when my older kid pushed the younger one into the pool.
We were at our local community pool, and there was a lifeguard who immediately sprang into action. But I remember how terrified I felt — and how helpless. I enrolled my kids in swim lessons right away. I also found an instructor to teach me, too, because I realized I had to overcome my own fear of the water and learn basic water safety skills to protect my kids.
Learning how to swim as an adult is a humbling experience, especially for people like me who had to first start with overcoming fear. I began literally from zero. For weeks, I worked on just getting comfortable submerging my head underwater.
Eventually, I learned how to swim and now really enjoy being in the water. And I feel a lot more comfortable supervising my children when we are in private or community swimming spaces. I’m looking forward to our local pool opening for the summer and to spending time with my kids having a fun — and safe — time in the water.
"""
            print(f"Input (Long EN), num chars: {len(long_article_en)}")
            summary_long_en_result = engine.summarize(long_article_en, overall_min_length=50, overall_max_length=200)
            print(f"Result (Long EN): {summary_long_en_result}")
        except Exception as e:
            print(f"Failed to initialize or use Summarizer engine: {e}")
            import traceback
            traceback.print_exc()
    else:
        print("CRITICAL: fastText model not loaded. Please ensure 'lid.176.bin' is present and readable.")
        print("Skipping direct engine tests.")