# """ # FilGoalBot Preprocessing Pipeline — v3 (Production) # ===================================================== # Input: data/raw/articles.jsonl # Output: data/processed/chunks.jsonl # data/processed/stats.json # All 9 FilGoal-specific noise patterns handled: # 1. Social share empty links [](https://twitter/facebook...) # 2. Date/byline: 'الأحد، 15 مارس 2026 - 02:08 كتب : FilGoal' # 3. Scoreboard widget: **1**\\ TeamA\\ **1**\\ TeamB\\ **League** # 4. انتهتHH:MM inline score timestamps # 5. Backslash sequences \\\\ (Firecrawl markdown artifact) # 6. News ticker sidebar: 'N دقيقة |' or 'ساعة |' — CUT everything after # 7. Related articles separator '__' — CUT everything after # 8. Markdown bold/italic/headers/links/images # 9. HTML tags, Getty captions, duplicate adjacent phrases # """ # import re, json, logging, argparse # from pathlib import Path # from datetime import datetime # from collections import defaultdict # logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s") # log = logging.getLogger("preprocessing") # RAW_FILE = Path("data/raw/articles.jsonl") # OUTPUT_DIR = Path("data/processed") # CHUNKS_FILE = OUTPUT_DIR / "chunks.jsonl" # STATS_FILE = OUTPUT_DIR / "stats.json" # CHUNK_SIZE, CHUNK_OVERLAP = 300, 60 # _AR_DAYS = r'(الأحد|الإثنين|الثلاثاء|الأربعاء|الخميس|الجمعة|السبت)' # _HARAKAT = re.compile(r'[\u064B-\u065F\u0670\u0640]') # def clean_filgoal_body(text: str, title: str = '') -> str: # if not text: # return '' # # 1. Empty social share links # text = re.sub(r'\[\]\(https?://[^\)]+\)\s*', '', text) # # 2. Date + byline header # text = re.sub(_AR_DAYS + r'[،,]\s*\d{1,2}\s+\w+\s+\d{4}\s*[-–]\s*\d{2}:\d{2}\s*', '', text) # text = re.sub(r'كتب\s*:\s*FilGoal\s*', '', text) # # 3. Scoreboard widget # text = re.sub( # r'\*\*\d+\*\*\s*\\\\\s*[^\*\n]{2,40}\s*\\\\\s*' # r'\*\*\d+\*\*\s*\\\\\s*[^\*\n]{2,40}\s*\\\\\s*\*\*[^\*\n]+\*\*', # '', text # ) # # 4. انتهتHH:MM # text = re.sub(r'انتهت\d{2}:\d{2}\s*', '', text) # # 5. All backslashes → space (must happen BEFORE ticker cut) # text = re.sub(r'\\+', ' ', text) # # 6. CUT news ticker: 'N دقيقة |' or 'ساعة |' # ticker = re.search(r'\d+\s+دقيقة\s+\||\bساعة\s+\|', text) # if ticker: # text = text[:ticker.start()].strip() # # 7. CUT at __ related articles separator # dunder = text.find(' __ ') # if dunder > 200: # text = text[:dunder].strip() # # 8. Remove /articles/NNNNN paths # text = re.sub(r'/articles/\d+\S*', '', text) # # 9. Markdown → plain text # text = re.sub(r'\*\*([^\*\n]+)\*\*', r'\1', text) # text = re.sub(r'\*([^\*\n]+)\*', r'\1', text) # text = re.sub(r'#{1,6}\s+', '', text) # text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text) # text = re.sub(r'\[\]\([^\)]+\)', '', text) # text = re.sub(r'!\[[^\]]*\]\([^\)]+\)', '', text) # # 10. HTML tags # text = re.sub(r'<[^>]+>', '', text) # # 11. Image captions # text = re.sub(r'صورة\s*:\s*\S+', '', text) # text = re.sub(r'Getty\s*(Images?)?', '', text, flags=re.IGNORECASE) # # 12. Deduplicate repeated adjacent words # text = re.sub(r'\b(\S{4,})\s+\1\b', r'\1', text) # # 13. Final whitespace # text = re.sub(r'[ \t]{2,}', ' ', text).strip() # # 14. Remove title duplicated at start # if title and len(title) > 10 and text.startswith(title[:20].strip()): # text = text[len(title[:20]):].strip() # return text # def normalize_arabic(text: str) -> str: # """Normalize for embedding — do NOT use on display text.""" # if not text: # return '' # text = _HARAKAT.sub('', text) # for frm, to in [('[أإآٱ]', 'ا'), ('ى', 'ي'), ('ة', 'ه'), ('ؤ', 'و')]: # text = re.sub(frm, to, text) # return re.sub(r'\s+', ' ', text).strip() # EGYPTIAN_TEAMS = { # 'الأهلي': 'al_ahly', 'الزمالك': 'zamalek', 'بيراميدز': 'pyramids', # 'الإسماعيلي': 'ismaily', 'المصري': 'masry', 'سيراميكا': 'ceramica', # 'طلائع الجيش': 'tala3a', 'فاركو': 'farco', 'حرس الحدود': 'haras', # 'إنبي': 'enppi', 'المقاولون': 'mokawloon', 'مودرن': 'modern', # 'البنك الأهلي': 'nbe', 'غزل المحلة': 'ghazl', 'سموحة': 'smouha', # 'الجونة': 'el_gouna', # } # LEAGUE_KEYWORDS = { # 'premier_league': ['الدوري الإنجليزي', 'الدوري الإنجليزي الممتاز'], # 'la_liga': ['الدوري الإسباني', 'لاليغا'], # 'serie_a': ['الدوري الإيطالي'], # 'bundesliga': ['الدوري الألماني'], # 'ligue_1': ['الدوري الفرنسي'], # 'champions_league': ['دوري أبطال أوروبا'], # 'caf_champions': ['دوري أبطال إفريقيا', 'الكونفدرالية'], # 'egyptian_league': ['الدوري المصري', 'دوري المحترفين'], # 'saudi_league': ['الدوري السعودي', 'روشن'], # } # def detect_teams(text: str) -> list: # seen, result = set(), [] # for ar, en in EGYPTIAN_TEAMS.items(): # if ar in text and en not in seen: # seen.add(en); result.append(en) # return result # def detect_league(title: str, section: str, body: str) -> str: # combined = f"{title} {section} {body[:300]}" # for lid, kws in LEAGUE_KEYWORDS.items(): # if any(kw in combined for kw in kws): # return lid # SECT_MAP = { # 'الكرة المصرية': 'egyptian_league', 'الدوري المصري': 'egyptian_league', # 'الكرة الإفريقية': 'caf_champions', 'سعودي في الجول': 'saudi_league', # 'الدوري الإنجليزي': 'premier_league','الكرة الأوروبية': 'champions_league', # } # for k, v in SECT_MAP.items(): # if k in section: # return v # return 'other' # def chunk_text(text: str) -> list: # words = text.split() # if len(words) <= CHUNK_SIZE: # return [text] # chunks, start = [], 0 # while start < len(words): # end = min(start + CHUNK_SIZE, len(words)) # chunks.append(' '.join(words[start:end])) # if end == len(words): # break # start += CHUNK_SIZE - CHUNK_OVERLAP # return chunks # TYPE_AR = { # 'lineup': 'تشكيلة', 'match_result': 'نتيجة مباراة', # 'press_conference': 'مؤتمر صحفي', 'training': 'تدريب', # 'transfer': 'ميركاتو', 'article': 'خبر', # } # def build_chunk_text(art: dict, chunk_body: str, idx: int, total: int) -> str: # parts = [f"عنوان: {art.get('title_norm') or art.get('title', '')}"] # t = art.get('article_type', 'article') # if t != 'article': # parts.append(f"نوع: {TYPE_AR.get(t, t)}") # if art.get('section'): # parts.append(f"قسم: {art['section']}") # if art.get('league', 'other') != 'other': # parts.append(f"بطولة: {art['league']}") # if art.get('teams'): # parts.append(f"الفرق: {' - '.join(art['teams'][:3])}") # if art.get('pub_date'): # parts.append(f"تاريخ: {art['pub_date'][:10]}") # prefix = ' | '.join(parts) # suffix = f"[جزء {idx+1}/{total}]\n" if total > 1 else "" # return f"{prefix}\n\n{suffix}{chunk_body}" # def run_pipeline(raw_file: Path = RAW_FILE): # OUTPUT_DIR.mkdir(parents=True, exist_ok=True) # if not Path(raw_file).exists(): # log.error(f"Not found: {raw_file}") # return # articles, seen_ids = [], set() # with open(raw_file, encoding='utf-8') as f: # for line in f: # line = line.strip() # if not line: # continue # try: # art = json.loads(line) # aid = art.get('article_id') # if aid and aid not in seen_ids: # seen_ids.add(aid) # articles.append(art) # except json.JSONDecodeError: # pass # log.info(f"Loaded {len(articles)} articles") # type_c = defaultdict(int) # league_c = defaultdict(int) # total_chunks = skipped = 0 # body_lens = [] # with open(CHUNKS_FILE, 'w', encoding='utf-8') as fout: # for art in articles: # title = art.get('title', '') # bc = clean_filgoal_body(art.get('body', ''), title) # if len(bc) < 80: # skipped += 1 # continue # body_lens.append(len(bc)) # tn = normalize_arabic(title) # bn = normalize_arabic(bc) # teams = detect_teams(title + ' ' + bc) # league = detect_league(title, art.get('section', ''), bc) # enriched = {**art, 'body_clean': bc, 'title_norm': tn, 'teams': teams, 'league': league} # chunks = chunk_text(bn) # n = len(chunks) # for i, cb in enumerate(chunks): # fout.write(json.dumps({ # 'chunk_id': f"{art['article_id']}_{i}", # 'article_id': art['article_id'], # 'chunk_index': i, # 'total_chunks': n, # 'text': build_chunk_text(enriched, cb, i, n), # 'title': title, # 'title_norm': tn, # 'body_clean': bc, # 'section': art.get('section', ''), # 'article_type': art.get('article_type', 'article'), # 'pub_date': art.get('pub_date', ''), # 'teams': teams, # 'league': league, # 'tags': art.get('tags', []), # 'source_url': art.get('source_url', ''), # 'image': art.get('image', ''), # }, ensure_ascii=False) + '\n') # total_chunks += 1 # type_c[art.get('article_type', 'article')] += 1 # league_c[league] += 1 # avg = int(sum(body_lens) / len(body_lens)) if body_lens else 0 # stats = { # 'total_articles': len(articles), # 'articles_processed': len(articles) - skipped, # 'skipped': skipped, # 'total_chunks': total_chunks, # 'avg_body_length': avg, # 'min_body_length': min(body_lens) if body_lens else 0, # 'max_body_length': max(body_lens) if body_lens else 0, # 'article_types': dict(type_c), # 'league_coverage': dict(league_c), # 'processed_at': datetime.now().isoformat(), # } # STATS_FILE.write_text(json.dumps(stats, ensure_ascii=False, indent=2)) # log.info(f"\n Preprocessing complete!") # log.info(f" Articles : {len(articles) - skipped} / {len(articles)}") # log.info(f" Chunks : {total_chunks}") # log.info(f" Avg body : {avg} chars") # log.info(f" Types : {dict(type_c)}") # log.info(f" Leagues : {dict(league_c)}") # log.info(f" Output : {CHUNKS_FILE}") # return stats # if __name__ == '__main__': # parser = argparse.ArgumentParser(description='FilGoalBot Preprocessing Pipeline') # parser.add_argument('--input', default=str(RAW_FILE), help='Path to raw articles.jsonl') # args = parser.parse_args() # run_pipeline(Path(args.input)) """ FilGoalBot Preprocessing Pipeline — v4 (Production) ===================================================== Input: data/raw/articles.jsonl Output: data/processed/chunks.jsonl data/processed/stats.json All 9 FilGoal-specific noise patterns handled: 1. Social share empty links [](https://twitter/facebook...) 2. Date/byline: 'الأحد، 15 مارس 2026 - 02:08 كتب : FilGoal' 3. Scoreboard widget: **1**\\ TeamA\\ **1**\\ TeamB\\ **League** 4. انتهتHH:MM inline score timestamps 5. Backslash sequences \\\\ (Firecrawl markdown artifact) 6. News ticker sidebar: 'N دقيقة |' or 'ساعة |' — CUT everything after 7. Related articles separator '__' — CUT everything after 8. Markdown bold/italic/headers/links/images 9. HTML tags, Getty captions, duplicate adjacent phrases v4 changes: - Skip non-football sections: كرة يد / كرة سلة / كرة طائرة - video:1 placeholders removed - Embedded tweet pic links pic.twitter.com stripped - Hashtags stripped (including escaped \\_) - Angle-bracket tweet separator > replaced with space - FilGoal domain refs filgoal.com/... stripped - Client JS artifact at tail stripped - HaytersTV promo paragraph stripped - English tweet dates (March 14, 2026) stripped - YouTube label + iframe block stripped - beIN Sports handles @beINSPORTS stripped - Mixed bold-italic _**text**_ stripped - Emojis removed - Tashkeel/tatweel diacritics removed (normalization) - Alef variants أ إ آ → ا (normalization) """ import re, json, logging, argparse from pathlib import Path from datetime import datetime from collections import defaultdict logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s") log = logging.getLogger("preprocessing") RAW_FILE = Path("data/raw/articles.jsonl") OUTPUT_DIR = Path("data/processed") CHUNKS_FILE = OUTPUT_DIR / "chunks.jsonl" STATS_FILE = OUTPUT_DIR / "stats.json" CHUNK_SIZE, CHUNK_OVERLAP = 300, 60 # Sections to skip entirely — not football content NON_FOOTBALL_SECTIONS = { 'كرة يد', 'كرة سلة', 'كرة طائرة', 'رياضات أخرى', } _AR_DAYS = r'(الأحد|الإثنين|الثلاثاء|الأربعاء|الخميس|الجمعة|السبت)' _HARAKAT = re.compile(r'[\u064B-\u065F\u0670\u0640]') _EMOJI = re.compile( r'[\U0001F300-\U0001F9FF' r'\U00002600-\U000027FF' r'\U0000FE00-\U0000FE0F' r'\U0001FA00-\U0001FA9F]+', flags=re.UNICODE, ) _ENGLISH_MONTHS = ( r'(January|February|March|April|May|June|' r'July|August|September|October|November|December)' ) def clean_filgoal_body(text: str, title: str = '') -> str: if not text: return '' # ── Phase A: pre-backslash patterns ────────────────────────────────────── # 1. Empty social share links text = re.sub(r'\[\]\(https?://[^\)]+\)\s*', '', text) # 2. Date + byline header text = re.sub( _AR_DAYS + r'[،,]\s*\d{1,2}\s+\w+\s+\d{4}\s*[-–]\s*\d{2}:\d{2}\s*', '', text, ) text = re.sub(r'كتب\s*:\s*FilGoal\s*', '', text) # 3. Scoreboard widget **1**\\ TeamA\\ **1**\\ TeamB\\ **League** text = re.sub( r'\*\*\d+\*\*\s*\\\\\s*[^\*\n]{2,40}\s*\\\\\s*' r'\*\*\d+\*\*\s*\\\\\s*[^\*\n]{2,40}\s*\\\\\s*\*\*[^\*\n]+\*\*', '', text, ) # 4. انتهتHH:MM text = re.sub(r'انتهت\d{2}:\d{2}\s*', '', text) # 5. video:N placeholders text = re.sub(r'\bvideo:\d+\b', '', text) # 6. Embedded tweet pic links text = re.sub(r'https?://pic\.twitter\.com/\S+', '', text) text = re.sub(r'pic\.twitter\.com/\S+', '', text) # 7. Hashtags (plain and escaped-underscore forms) text = re.sub(r'#[\w\u0600-\u06FF]+(?:\\_[\w\u0600-\u06FF]+)*', '', text) # 8. Angle-bracket tweet separator lines (">") text = re.sub(r'(?m)^>\s*', ' ', text) # 9. FilGoal domain references — three variants text = re.sub(r'https?://(?:www\.)?filgoal\.com/\S*', '', text) text = re.sub(r'(?:www\.)?filgoal\.com/\S*', '', text) text = re.sub(r'\bfilgoal\.com\b', '', text) # ── Phase B: hard cuts ──────────────────────────────────────────────────── # 10. All backslashes → space (must happen BEFORE ticker/related cuts) text = re.sub(r'\\+', ' ', text) # 11. CUT news ticker: 'N دقيقة |' or 'ساعة |' ticker = re.search(r'\d+\s+دقيقة\s+\||\bساعة\s+\|', text) if ticker: text = text[:ticker.start()].strip() # 12. CUT at __ related articles separator dunder = text.find(' __ ') if dunder > 200: text = text[:dunder].strip() # ── Phase C: post-cut cleanup ───────────────────────────────────────────── # 13. Remove /articles/NNNNN paths text = re.sub(r'/articles/\d+\S*', '', text) # 14. Strip "Client" JS artifact at tail text = re.sub(r'\bClient\b.*$', '', text, flags=re.DOTALL) # 15. HaytersTV promo paragraph text = re.sub( r'هايترز\s*تي\s*في.*?(?:يوتيوب|YouTube|اشترك)[^\n]*', '', text, flags=re.IGNORECASE | re.DOTALL, ) # 16. YouTube label + iframe block text = re.sub(r'\bYouTube\b[^\n]*', '', text, flags=re.IGNORECASE) text = re.sub( r'\d[\d,\.]*\s*(?:مشترك|subscriber)[^\n]*', '', text, flags=re.IGNORECASE, ) # 17. beIN Sports handles and attributions text = re.sub(r'@beIN\w*', '', text, flags=re.IGNORECASE) text = re.sub(r'beIN\s*Sports?\s*(?:عربي|العربية|Arabic)?', '', text, flags=re.IGNORECASE) # 18. English tweet dates "March 14, 2026" / "14 March 2026" text = re.sub( r'\b\d{1,2}\s+' + _ENGLISH_MONTHS + r'\s+\d{4}\b', '', text, ) text = re.sub( _ENGLISH_MONTHS + r'\s+\d{1,2},?\s+\d{4}\b', '', text, ) # 19. Mixed bold-italic _**text**_ or **_text_** text = re.sub(r'_\*\*([^\*\n]+)\*\*_', r'\1', text) text = re.sub(r'\*\*_([^_\n]+)_\*\*', r'\1', text) # 20. Emojis text = _EMOJI.sub('', text) # ── Phase D: standard markdown → plain text ─────────────────────────────── text = re.sub(r'\*\*([^\*\n]+)\*\*', r'\1', text) text = re.sub(r'\*([^\*\n]+)\*', r'\1', text) text = re.sub(r'#{1,6}\s+', '', text) text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text) text = re.sub(r'\[\]\([^\)]+\)', '', text) text = re.sub(r'!\[[^\]]*\]\([^\)]+\)', '', text) # ── Phase E: HTML + captions ────────────────────────────────────────────── text = re.sub(r'<[^>]+>', '', text) text = re.sub(r'صورة\s*:\s*\S+', '', text) text = re.sub(r'Getty\s*(Images?)?', '', text, flags=re.IGNORECASE) # ── Phase F: final normalisation ────────────────────────────────────────── # Deduplicate repeated adjacent words text = re.sub(r'\b(\S{4,})\s+\1\b', r'\1', text) # Collapse whitespace text = re.sub(r'[ \t]{2,}', ' ', text).strip() # Remove title duplicated at start if title and len(title) > 10 and text.startswith(title[:20].strip()): text = text[len(title[:20]):].strip() return text def normalize_arabic(text: str) -> str: """Normalize for embedding — do NOT use on display text.""" if not text: return '' text = _HARAKAT.sub('', text) for frm, to in [ ('[أإآٱ]', 'ا'), ('ى', 'ي'), ('ة', 'ه'), ('ؤ', 'و'), ]: text = re.sub(frm, to, text) return re.sub(r'\s+', ' ', text).strip() EGYPTIAN_TEAMS = { 'الأهلي': 'al_ahly', 'الزمالك': 'zamalek', 'بيراميدز': 'pyramids', 'الإسماعيلي': 'ismaily', 'المصري': 'masry', 'سيراميكا': 'ceramica', 'طلائع الجيش': 'tala3a', 'فاركو': 'farco', 'حرس الحدود': 'haras', 'إنبي': 'enppi', 'المقاولون': 'mokawloon', 'مودرن': 'modern', 'البنك الأهلي': 'nbe', 'غزل المحلة': 'ghazl', 'سموحة': 'smouha', 'الجونة': 'el_gouna', } LEAGUE_KEYWORDS = { 'premier_league': ['الدوري الإنجليزي', 'الدوري الإنجليزي الممتاز'], 'la_liga': ['الدوري الإسباني', 'لاليغا'], 'serie_a': ['الدوري الإيطالي'], 'bundesliga': ['الدوري الألماني'], 'ligue_1': ['الدوري الفرنسي'], 'champions_league': ['دوري أبطال أوروبا'], 'caf_champions': ['دوري أبطال إفريقيا', 'الكونفدرالية'], 'egyptian_league': ['الدوري المصري', 'دوري المحترفين'], 'saudi_league': ['الدوري السعودي', 'روشن'], } def detect_teams(text: str) -> list: seen, result = set(), [] for ar, en in EGYPTIAN_TEAMS.items(): if ar in text and en not in seen: seen.add(en) result.append(en) return result def detect_league(title: str, section: str, body: str) -> str: combined = f"{title} {section} {body[:300]}" for lid, kws in LEAGUE_KEYWORDS.items(): if any(kw in combined for kw in kws): return lid SECT_MAP = { 'الكرة المصرية': 'egyptian_league', 'الدوري المصري': 'egyptian_league', 'الكرة الإفريقية': 'caf_champions', 'سعودي في الجول': 'saudi_league', 'الدوري الإنجليزي': 'premier_league', 'الكرة الأوروبية': 'champions_league', } for k, v in SECT_MAP.items(): if k in section: return v return 'other' def chunk_text(text: str) -> list: words = text.split() if len(words) <= CHUNK_SIZE: return [text] chunks, start = [], 0 while start < len(words): end = min(start + CHUNK_SIZE, len(words)) chunks.append(' '.join(words[start:end])) if end == len(words): break start += CHUNK_SIZE - CHUNK_OVERLAP return chunks TYPE_AR = { 'lineup': 'تشكيلة', 'match_result': 'نتيجة مباراة', 'press_conference': 'مؤتمر صحفي', 'training': 'تدريب', 'transfer': 'ميركاتو', 'article': 'خبر', } def build_chunk_text(art: dict, chunk_body: str, idx: int, total: int) -> str: parts = [f"عنوان: {art.get('title_norm') or art.get('title', '')}"] t = art.get('article_type', 'article') if t != 'article': parts.append(f"نوع: {TYPE_AR.get(t, t)}") if art.get('section'): parts.append(f"قسم: {art['section']}") if art.get('league', 'other') != 'other': parts.append(f"بطولة: {art['league']}") if art.get('teams'): parts.append(f"الفرق: {' - '.join(art['teams'][:3])}") if art.get('pub_date'): parts.append(f"تاريخ: {art['pub_date'][:10]}") prefix = ' | '.join(parts) suffix = f"[جزء {idx+1}/{total}]\n" if total > 1 else "" return f"{prefix}\n\n{suffix}{chunk_body}" def run_pipeline(raw_file: Path = RAW_FILE): OUTPUT_DIR.mkdir(parents=True, exist_ok=True) if not Path(raw_file).exists(): log.error(f"Not found: {raw_file}") return articles, seen_ids = [], set() with open(raw_file, encoding='utf-8') as f: for line in f: line = line.strip() if not line: continue try: art = json.loads(line) aid = art.get('article_id') if aid and aid not in seen_ids: seen_ids.add(aid) articles.append(art) except json.JSONDecodeError: pass log.info(f"Loaded {len(articles)} articles") type_c = defaultdict(int) league_c = defaultdict(int) total_chunks = skipped = skipped_section = 0 body_lens = [] with open(CHUNKS_FILE, 'w', encoding='utf-8') as fout: for art in articles: # ── Filter: skip non-football sections ─────────────────────────── section = art.get('section', '').strip() if section in NON_FOOTBALL_SECTIONS: skipped_section += 1 log.debug(f"Skipped non-football section '{section}': {art.get('article_id')}") continue title = art.get('title', '') bc = clean_filgoal_body(art.get('body', ''), title) if len(bc) < 80: skipped += 1 continue body_lens.append(len(bc)) tn = normalize_arabic(title) bn = normalize_arabic(bc) teams = detect_teams(title + ' ' + bc) league = detect_league(title, section, bc) enriched = { **art, 'body_clean': bc, 'title_norm': tn, 'teams': teams, 'league': league, } chunks = chunk_text(bn) n = len(chunks) for i, cb in enumerate(chunks): fout.write(json.dumps({ 'chunk_id': f"{art['article_id']}_{i}", 'article_id': art['article_id'], 'chunk_index': i, 'total_chunks': n, 'text': build_chunk_text(enriched, cb, i, n), 'title': title, 'title_norm': tn, 'body_clean': bc, 'section': section, 'article_type': art.get('article_type', 'article'), 'pub_date': art.get('pub_date', ''), 'teams': teams, 'league': league, 'tags': art.get('tags', []), 'source_url': art.get('source_url', ''), 'image': art.get('image', ''), }, ensure_ascii=False) + '\n') total_chunks += 1 type_c[art.get('article_type', 'article')] += 1 league_c[league] += 1 processed = len(articles) - skipped - skipped_section avg = int(sum(body_lens) / len(body_lens)) if body_lens else 0 stats = { 'total_articles': len(articles), 'articles_processed': processed, 'skipped_short_body': skipped, 'skipped_non_football': skipped_section, 'total_chunks': total_chunks, 'avg_body_length': avg, 'min_body_length': min(body_lens) if body_lens else 0, 'max_body_length': max(body_lens) if body_lens else 0, 'article_types': dict(type_c), 'league_coverage': dict(league_c), 'processed_at': datetime.now().isoformat(), } STATS_FILE.write_text(json.dumps(stats, ensure_ascii=False, indent=2)) log.info(f"\n Preprocessing complete!") log.info(f" Articles total : {len(articles)}") log.info(f" Processed : {processed}") log.info(f" Skipped (short body): {skipped}") log.info(f" Skipped (non-football sections): {skipped_section} " f"{sorted(NON_FOOTBALL_SECTIONS)}") log.info(f" Chunks : {total_chunks}") log.info(f" Avg body : {avg} chars") log.info(f" Types : {dict(type_c)}") log.info(f" Leagues : {dict(league_c)}") log.info(f" Output : {CHUNKS_FILE}") return stats if __name__ == '__main__': parser = argparse.ArgumentParser(description='FilGoalBot Preprocessing Pipeline v4') parser.add_argument('--input', default=str(RAW_FILE), help='Path to raw articles.jsonl') args = parser.parse_args() run_pipeline(Path(args.input))