Fil-RAG-Goal / preprocessing /pipeline.py
Omar10lfc's picture
Initial Space deployment
18fd039
# """
# FilGoalBot Preprocessing Pipeline — v3 (Production)
# =====================================================
# Input: data/raw/articles.jsonl
# Output: data/processed/chunks.jsonl
# data/processed/stats.json
# All 9 FilGoal-specific noise patterns handled:
# 1. Social share empty links [](https://twitter/facebook...)
# 2. Date/byline: 'الأحد، 15 مارس 2026 - 02:08 كتب : FilGoal'
# 3. Scoreboard widget: **1**\\ TeamA\\ **1**\\ TeamB\\ **League**
# 4. انتهتHH:MM inline score timestamps
# 5. Backslash sequences \\\\ (Firecrawl markdown artifact)
# 6. News ticker sidebar: 'N دقيقة |' or 'ساعة |' — CUT everything after
# 7. Related articles separator '__' — CUT everything after
# 8. Markdown bold/italic/headers/links/images
# 9. HTML tags, Getty captions, duplicate adjacent phrases
# """
# import re, json, logging, argparse
# from pathlib import Path
# from datetime import datetime
# from collections import defaultdict
# logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
# log = logging.getLogger("preprocessing")
# RAW_FILE = Path("data/raw/articles.jsonl")
# OUTPUT_DIR = Path("data/processed")
# CHUNKS_FILE = OUTPUT_DIR / "chunks.jsonl"
# STATS_FILE = OUTPUT_DIR / "stats.json"
# CHUNK_SIZE, CHUNK_OVERLAP = 300, 60
# _AR_DAYS = r'(الأحد|الإثنين|الثلاثاء|الأربعاء|الخميس|الجمعة|السبت)'
# _HARAKAT = re.compile(r'[\u064B-\u065F\u0670\u0640]')
# def clean_filgoal_body(text: str, title: str = '') -> str:
# if not text:
# return ''
# # 1. Empty social share links
# text = re.sub(r'\[\]\(https?://[^\)]+\)\s*', '', text)
# # 2. Date + byline header
# text = re.sub(_AR_DAYS + r'[،,]\s*\d{1,2}\s+\w+\s+\d{4}\s*[-–]\s*\d{2}:\d{2}\s*', '', text)
# text = re.sub(r'كتب\s*:\s*FilGoal\s*', '', text)
# # 3. Scoreboard widget
# text = re.sub(
# r'\*\*\d+\*\*\s*\\\\\s*[^\*\n]{2,40}\s*\\\\\s*'
# r'\*\*\d+\*\*\s*\\\\\s*[^\*\n]{2,40}\s*\\\\\s*\*\*[^\*\n]+\*\*',
# '', text
# )
# # 4. انتهتHH:MM
# text = re.sub(r'انتهت\d{2}:\d{2}\s*', '', text)
# # 5. All backslashes → space (must happen BEFORE ticker cut)
# text = re.sub(r'\\+', ' ', text)
# # 6. CUT news ticker: 'N دقيقة |' or 'ساعة |'
# ticker = re.search(r'\d+\s+دقيقة\s+\||\bساعة\s+\|', text)
# if ticker:
# text = text[:ticker.start()].strip()
# # 7. CUT at __ related articles separator
# dunder = text.find(' __ ')
# if dunder > 200:
# text = text[:dunder].strip()
# # 8. Remove /articles/NNNNN paths
# text = re.sub(r'/articles/\d+\S*', '', text)
# # 9. Markdown → plain text
# text = re.sub(r'\*\*([^\*\n]+)\*\*', r'\1', text)
# text = re.sub(r'\*([^\*\n]+)\*', r'\1', text)
# text = re.sub(r'#{1,6}\s+', '', text)
# text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
# text = re.sub(r'\[\]\([^\)]+\)', '', text)
# text = re.sub(r'!\[[^\]]*\]\([^\)]+\)', '', text)
# # 10. HTML tags
# text = re.sub(r'<[^>]+>', '', text)
# # 11. Image captions
# text = re.sub(r'صورة\s*:\s*\S+', '', text)
# text = re.sub(r'Getty\s*(Images?)?', '', text, flags=re.IGNORECASE)
# # 12. Deduplicate repeated adjacent words
# text = re.sub(r'\b(\S{4,})\s+\1\b', r'\1', text)
# # 13. Final whitespace
# text = re.sub(r'[ \t]{2,}', ' ', text).strip()
# # 14. Remove title duplicated at start
# if title and len(title) > 10 and text.startswith(title[:20].strip()):
# text = text[len(title[:20]):].strip()
# return text
# def normalize_arabic(text: str) -> str:
# """Normalize for embedding — do NOT use on display text."""
# if not text:
# return ''
# text = _HARAKAT.sub('', text)
# for frm, to in [('[أإآٱ]', 'ا'), ('ى', 'ي'), ('ة', 'ه'), ('ؤ', 'و')]:
# text = re.sub(frm, to, text)
# return re.sub(r'\s+', ' ', text).strip()
# EGYPTIAN_TEAMS = {
# 'الأهلي': 'al_ahly', 'الزمالك': 'zamalek', 'بيراميدز': 'pyramids',
# 'الإسماعيلي': 'ismaily', 'المصري': 'masry', 'سيراميكا': 'ceramica',
# 'طلائع الجيش': 'tala3a', 'فاركو': 'farco', 'حرس الحدود': 'haras',
# 'إنبي': 'enppi', 'المقاولون': 'mokawloon', 'مودرن': 'modern',
# 'البنك الأهلي': 'nbe', 'غزل المحلة': 'ghazl', 'سموحة': 'smouha',
# 'الجونة': 'el_gouna',
# }
# LEAGUE_KEYWORDS = {
# 'premier_league': ['الدوري الإنجليزي', 'الدوري الإنجليزي الممتاز'],
# 'la_liga': ['الدوري الإسباني', 'لاليغا'],
# 'serie_a': ['الدوري الإيطالي'],
# 'bundesliga': ['الدوري الألماني'],
# 'ligue_1': ['الدوري الفرنسي'],
# 'champions_league': ['دوري أبطال أوروبا'],
# 'caf_champions': ['دوري أبطال إفريقيا', 'الكونفدرالية'],
# 'egyptian_league': ['الدوري المصري', 'دوري المحترفين'],
# 'saudi_league': ['الدوري السعودي', 'روشن'],
# }
# def detect_teams(text: str) -> list:
# seen, result = set(), []
# for ar, en in EGYPTIAN_TEAMS.items():
# if ar in text and en not in seen:
# seen.add(en); result.append(en)
# return result
# def detect_league(title: str, section: str, body: str) -> str:
# combined = f"{title} {section} {body[:300]}"
# for lid, kws in LEAGUE_KEYWORDS.items():
# if any(kw in combined for kw in kws):
# return lid
# SECT_MAP = {
# 'الكرة المصرية': 'egyptian_league', 'الدوري المصري': 'egyptian_league',
# 'الكرة الإفريقية': 'caf_champions', 'سعودي في الجول': 'saudi_league',
# 'الدوري الإنجليزي': 'premier_league','الكرة الأوروبية': 'champions_league',
# }
# for k, v in SECT_MAP.items():
# if k in section:
# return v
# return 'other'
# def chunk_text(text: str) -> list:
# words = text.split()
# if len(words) <= CHUNK_SIZE:
# return [text]
# chunks, start = [], 0
# while start < len(words):
# end = min(start + CHUNK_SIZE, len(words))
# chunks.append(' '.join(words[start:end]))
# if end == len(words):
# break
# start += CHUNK_SIZE - CHUNK_OVERLAP
# return chunks
# TYPE_AR = {
# 'lineup': 'تشكيلة', 'match_result': 'نتيجة مباراة',
# 'press_conference': 'مؤتمر صحفي', 'training': 'تدريب',
# 'transfer': 'ميركاتو', 'article': 'خبر',
# }
# def build_chunk_text(art: dict, chunk_body: str, idx: int, total: int) -> str:
# parts = [f"عنوان: {art.get('title_norm') or art.get('title', '')}"]
# t = art.get('article_type', 'article')
# if t != 'article':
# parts.append(f"نوع: {TYPE_AR.get(t, t)}")
# if art.get('section'):
# parts.append(f"قسم: {art['section']}")
# if art.get('league', 'other') != 'other':
# parts.append(f"بطولة: {art['league']}")
# if art.get('teams'):
# parts.append(f"الفرق: {' - '.join(art['teams'][:3])}")
# if art.get('pub_date'):
# parts.append(f"تاريخ: {art['pub_date'][:10]}")
# prefix = ' | '.join(parts)
# suffix = f"[جزء {idx+1}/{total}]\n" if total > 1 else ""
# return f"{prefix}\n\n{suffix}{chunk_body}"
# def run_pipeline(raw_file: Path = RAW_FILE):
# OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# if not Path(raw_file).exists():
# log.error(f"Not found: {raw_file}")
# return
# articles, seen_ids = [], set()
# with open(raw_file, encoding='utf-8') as f:
# for line in f:
# line = line.strip()
# if not line:
# continue
# try:
# art = json.loads(line)
# aid = art.get('article_id')
# if aid and aid not in seen_ids:
# seen_ids.add(aid)
# articles.append(art)
# except json.JSONDecodeError:
# pass
# log.info(f"Loaded {len(articles)} articles")
# type_c = defaultdict(int)
# league_c = defaultdict(int)
# total_chunks = skipped = 0
# body_lens = []
# with open(CHUNKS_FILE, 'w', encoding='utf-8') as fout:
# for art in articles:
# title = art.get('title', '')
# bc = clean_filgoal_body(art.get('body', ''), title)
# if len(bc) < 80:
# skipped += 1
# continue
# body_lens.append(len(bc))
# tn = normalize_arabic(title)
# bn = normalize_arabic(bc)
# teams = detect_teams(title + ' ' + bc)
# league = detect_league(title, art.get('section', ''), bc)
# enriched = {**art, 'body_clean': bc, 'title_norm': tn, 'teams': teams, 'league': league}
# chunks = chunk_text(bn)
# n = len(chunks)
# for i, cb in enumerate(chunks):
# fout.write(json.dumps({
# 'chunk_id': f"{art['article_id']}_{i}",
# 'article_id': art['article_id'],
# 'chunk_index': i,
# 'total_chunks': n,
# 'text': build_chunk_text(enriched, cb, i, n),
# 'title': title,
# 'title_norm': tn,
# 'body_clean': bc,
# 'section': art.get('section', ''),
# 'article_type': art.get('article_type', 'article'),
# 'pub_date': art.get('pub_date', ''),
# 'teams': teams,
# 'league': league,
# 'tags': art.get('tags', []),
# 'source_url': art.get('source_url', ''),
# 'image': art.get('image', ''),
# }, ensure_ascii=False) + '\n')
# total_chunks += 1
# type_c[art.get('article_type', 'article')] += 1
# league_c[league] += 1
# avg = int(sum(body_lens) / len(body_lens)) if body_lens else 0
# stats = {
# 'total_articles': len(articles),
# 'articles_processed': len(articles) - skipped,
# 'skipped': skipped,
# 'total_chunks': total_chunks,
# 'avg_body_length': avg,
# 'min_body_length': min(body_lens) if body_lens else 0,
# 'max_body_length': max(body_lens) if body_lens else 0,
# 'article_types': dict(type_c),
# 'league_coverage': dict(league_c),
# 'processed_at': datetime.now().isoformat(),
# }
# STATS_FILE.write_text(json.dumps(stats, ensure_ascii=False, indent=2))
# log.info(f"\n Preprocessing complete!")
# log.info(f" Articles : {len(articles) - skipped} / {len(articles)}")
# log.info(f" Chunks : {total_chunks}")
# log.info(f" Avg body : {avg} chars")
# log.info(f" Types : {dict(type_c)}")
# log.info(f" Leagues : {dict(league_c)}")
# log.info(f" Output : {CHUNKS_FILE}")
# return stats
# if __name__ == '__main__':
# parser = argparse.ArgumentParser(description='FilGoalBot Preprocessing Pipeline')
# parser.add_argument('--input', default=str(RAW_FILE), help='Path to raw articles.jsonl')
# args = parser.parse_args()
# run_pipeline(Path(args.input))
"""
FilGoalBot Preprocessing Pipeline — v4 (Production)
=====================================================
Input: data/raw/articles.jsonl
Output: data/processed/chunks.jsonl
data/processed/stats.json
All 9 FilGoal-specific noise patterns handled:
1. Social share empty links [](https://twitter/facebook...)
2. Date/byline: 'الأحد، 15 مارس 2026 - 02:08 كتب : FilGoal'
3. Scoreboard widget: **1**\\ TeamA\\ **1**\\ TeamB\\ **League**
4. انتهتHH:MM inline score timestamps
5. Backslash sequences \\\\ (Firecrawl markdown artifact)
6. News ticker sidebar: 'N دقيقة |' or 'ساعة |' — CUT everything after
7. Related articles separator '__' — CUT everything after
8. Markdown bold/italic/headers/links/images
9. HTML tags, Getty captions, duplicate adjacent phrases
v4 changes:
- Skip non-football sections: كرة يد / كرة سلة / كرة طائرة / رياضات أخرى
- video:1 placeholders removed
- Embedded tweet pic links pic.twitter.com stripped
- Hashtags stripped (including escaped \\_)
- Angle-bracket tweet separator > replaced with space
- FilGoal domain refs filgoal.com/... stripped
- Client JS artifact at tail stripped
- HaytersTV promo paragraph stripped
- English tweet dates (March 14, 2026) stripped
- YouTube label + iframe block stripped
- beIN Sports handles @beINSPORTS stripped
- Mixed bold-italic _**text**_ stripped
- Emojis removed
- Tashkeel/tatweel diacritics removed (normalization)
- Alef variants أ إ آ ٱ → ا, plus ى → ي, ة → ه, ؤ → و (normalization)
"""
import re, json, logging, argparse
from pathlib import Path
from datetime import datetime
from collections import defaultdict
# Configure logging once at import time: timestamp, level, message.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
# Module-level logger shared by every step of the pipeline.
log = logging.getLogger("preprocessing")
# Default input file; run_pipeline() reads it line by line as JSON.
RAW_FILE = Path("data/raw/articles.jsonl")
# Directory holding both output files; run_pipeline() creates it if missing.
OUTPUT_DIR = Path("data/processed")
CHUNKS_FILE = OUTPUT_DIR / "chunks.jsonl"  # one JSON chunk record per line
STATS_FILE = OUTPUT_DIR / "stats.json"  # summary statistics of the last run
# Chunk window and overlap, both counted in whitespace-separated words
# (see chunk_text()).
CHUNK_SIZE, CHUNK_OVERLAP = 300, 60
# Sections to skip entirely — not football content.
# run_pipeline() compares the stripped 'section' field for exact membership.
NON_FOOTBALL_SECTIONS = {
'كرة يد',  # handball
'كرة سلة',  # basketball
'كرة طائرة',  # volleyball
'رياضات أخرى',  # other sports
}
# Arabic weekday names (Sunday..Saturday) as a regex alternation; anchors the
# date/byline header pattern in clean_filgoal_body().
_AR_DAYS = r'(الأحد|الإثنين|الثلاثاء|الأربعاء|الخميس|الجمعة|السبت)'
# Arabic diacritics U+064B–U+065F, superscript alef U+0670 and tatweel U+0640.
_HARAKAT = re.compile(r'[\u064B-\u065F\u0670\u0640]')
# Code-point ranges treated as emoji: U+1F300–U+1F9FF, U+2600–U+27FF,
# variation selectors U+FE00–U+FE0F and U+1FA00–U+1FA9F.
_EMOJI = re.compile(
    r'[\U0001F300-\U0001F9FF'
    r'\U00002600-\U000027FF'
    r'\U0000FE00-\U0000FE0F'
    r'\U0001FA00-\U0001FA9F]+',
    flags=re.UNICODE,
)
# English month names as a regex alternation (used for embedded tweet dates).
_ENGLISH_MONTHS = (
    r'(January|February|March|April|May|June|'
    r'July|August|September|October|November|December)'
)


def clean_filgoal_body(text: str, title: str = '') -> str:
    """Strip FilGoal/Firecrawl scraping noise from an article body.

    The rules run in a fixed order because several of them depend on what an
    earlier rule left behind:

    * Phase 0 resolves markdown images and links.  This has to come first:
      the URL-stripping rules further down use ``\\S*`` and would otherwise
      swallow the closing ``)`` of a link, leaving ``[text](`` in the output,
      and the empty-link rule would turn ``![](url)`` into a stray ``!``.
    * Phase A removes patterns that rely on the raw backslash sequences.
    * Phase B turns backslashes into spaces and then truncates the text at
      the news-ticker / related-articles markers.
    * Phases C–F remove the remaining promo text, markdown, HTML and
      captions, then normalise whitespace and drop a leading copy of the
      title.

    Args:
        text: Raw article body (markdown as scraped).  Falsy input yields ''.
        title: Article title; when the cleaned body starts with it, the
            duplicate is removed.  Ignored when 10 characters or shorter.

    Returns:
        The cleaned body as plain text.
    """
    if not text:
        return ''

    # ── Phase 0: markdown images and links ───────────────────────────────────
    # Images go first so that no link rule can consume the "[alt](url)" part
    # and leave the leading "!" behind.
    text = re.sub(r'!\[[^\]]*\]\([^\)]+\)', '', text)
    # 1. Empty social share links (also eats the whitespace that follows them)
    text = re.sub(r'\[\]\(https?://[^\)]+\)\s*', '', text)
    # Remaining links → their visible text; other empty links are dropped.
    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
    text = re.sub(r'\[\]\([^\)]+\)', '', text)

    # ── Phase A: pre-backslash patterns ──────────────────────────────────────
    # 2. Date + byline header
    text = re.sub(
        _AR_DAYS + r'[،,]\s*\d{1,2}\s+\w+\s+\d{4}\s*[-–]\s*\d{2}:\d{2}\s*',
        '', text,
    )
    text = re.sub(r'كتب\s*:\s*FilGoal\s*', '', text)
    # 3. Scoreboard widget **1**\\ TeamA\\ **1**\\ TeamB\\ **League**
    text = re.sub(
        r'\*\*\d+\*\*\s*\\\\\s*[^\*\n]{2,40}\s*\\\\\s*'
        r'\*\*\d+\*\*\s*\\\\\s*[^\*\n]{2,40}\s*\\\\\s*\*\*[^\*\n]+\*\*',
        '', text,
    )
    # 4. انتهتHH:MM
    text = re.sub(r'انتهت\d{2}:\d{2}\s*', '', text)
    # 5. video:N placeholders
    text = re.sub(r'\bvideo:\d+\b', '', text)
    # 6. Embedded tweet pic links
    text = re.sub(r'https?://pic\.twitter\.com/\S+', '', text)
    text = re.sub(r'pic\.twitter\.com/\S+', '', text)
    # 7. Hashtags (plain and escaped-underscore forms)
    text = re.sub(r'#[\w\u0600-\u06FF]+(?:\\_[\w\u0600-\u06FF]+)*', '', text)
    # 8. Angle-bracket tweet separator lines (">")
    text = re.sub(r'(?m)^>\s*', ' ', text)
    # 9. FilGoal domain references — three variants
    text = re.sub(r'https?://(?:www\.)?filgoal\.com/\S*', '', text)
    text = re.sub(r'(?:www\.)?filgoal\.com/\S*', '', text)
    text = re.sub(r'\bfilgoal\.com\b', '', text)

    # ── Phase B: hard cuts ───────────────────────────────────────────────────
    # 10. All backslashes → space (must happen BEFORE ticker/related cuts)
    text = re.sub(r'\\+', ' ', text)
    # 11. CUT news ticker: 'N دقيقة |' or 'ساعة |'
    ticker = re.search(r'\d+\s+دقيقة\s+\||\bساعة\s+\|', text)
    if ticker:
        text = text[:ticker.start()].strip()
    # 12. CUT at __ related articles separator; a separator within the first
    #     200 characters is ignored.
    dunder = text.find(' __ ')
    if dunder > 200:
        text = text[:dunder].strip()

    # ── Phase C: post-cut cleanup ────────────────────────────────────────────
    # 13. Remove /articles/NNNNN paths
    text = re.sub(r'/articles/\d+\S*', '', text)
    # 14. Strip "Client" JS artifact: everything from the first standalone
    #     word "Client" to the end of the text.
    text = re.sub(r'\bClient\b.*$', '', text, flags=re.DOTALL)
    # 15. HaytersTV promo paragraph
    text = re.sub(
        r'هايترز\s*تي\s*في.*?(?:يوتيوب|YouTube|اشترك)[^\n]*',
        '', text, flags=re.IGNORECASE | re.DOTALL,
    )
    # 16. YouTube label + iframe block (rest of the line), subscriber counts
    text = re.sub(r'\bYouTube\b[^\n]*', '', text, flags=re.IGNORECASE)
    text = re.sub(
        r'\d[\d,\.]*\s*(?:مشترك|subscriber)[^\n]*',
        '', text, flags=re.IGNORECASE,
    )
    # 17. beIN Sports handles and attributions
    text = re.sub(r'@beIN\w*', '', text, flags=re.IGNORECASE)
    text = re.sub(r'beIN\s*Sports?\s*(?:عربي|العربية|Arabic)?', '', text, flags=re.IGNORECASE)
    # 18. English tweet dates "March 14, 2026" / "14 March 2026"
    text = re.sub(
        r'\b\d{1,2}\s+' + _ENGLISH_MONTHS + r'\s+\d{4}\b', '', text,
    )
    text = re.sub(
        _ENGLISH_MONTHS + r'\s+\d{1,2},?\s+\d{4}\b', '', text,
    )
    # 19. Mixed bold-italic _**text**_ or **_text_**
    text = re.sub(r'_\*\*([^\*\n]+)\*\*_', r'\1', text)
    text = re.sub(r'\*\*_([^_\n]+)_\*\*', r'\1', text)
    # 20. Emojis
    text = _EMOJI.sub('', text)

    # ── Phase D: standard markdown → plain text ──────────────────────────────
    text = re.sub(r'\*\*([^\*\n]+)\*\*', r'\1', text)
    text = re.sub(r'\*([^\*\n]+)\*', r'\1', text)
    text = re.sub(r'#{1,6}\s+', '', text)
    # Second image/link pass (images first) as a safety net for anything that
    # only became well-formed after the removals above.
    text = re.sub(r'!\[[^\]]*\]\([^\)]+\)', '', text)
    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
    text = re.sub(r'\[\]\([^\)]+\)', '', text)

    # ── Phase E: HTML + captions ─────────────────────────────────────────────
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'صورة\s*:\s*\S+', '', text)
    text = re.sub(r'Getty\s*(Images?)?', '', text, flags=re.IGNORECASE)

    # ── Phase F: final normalisation ─────────────────────────────────────────
    # Deduplicate repeated adjacent words (4+ characters)
    text = re.sub(r'\b(\S{4,})\s+\1\b', r'\1', text)
    # Collapse runs of spaces/tabs (newlines are kept)
    text = re.sub(r'[ \t]{2,}', ' ', text).strip()
    # Remove title duplicated at start.  Prefer dropping the whole title; if
    # only its first 20 characters match (the body copy diverges later), fall
    # back to dropping exactly the matched prefix.
    if title and len(title) > 10:
        full_title = title.strip()
        title_prefix = title[:20].strip()
        if full_title and text.startswith(full_title):
            text = text[len(full_title):].strip()
        elif title_prefix and text.startswith(title_prefix):
            text = text[len(title_prefix):].strip()
    return text
# Single-pass translation table for normalize_arabic().
# Deleted: diacritics U+064B–U+065F, superscript alef U+0670, tatweel U+0640
# (the same code points as the module's _HARAKAT pattern).
# Folded:  alef variants → bare alef, alef maqsura → yeh,
#          teh marbuta → heh, waw with hamza → waw.
_AR_NORMALIZE_TABLE = {
    **dict.fromkeys(range(0x064B, 0x0660)),  # value None = delete
    0x0670: None,
    0x0640: None,
    0x0623: '\u0627',  # أ → ا
    0x0625: '\u0627',  # إ → ا
    0x0622: '\u0627',  # آ → ا
    0x0671: '\u0627',  # ٱ → ا
    0x0649: '\u064A',  # ى → ي
    0x0629: '\u0647',  # ة → ه
    0x0624: '\u0648',  # ؤ → و
}


def normalize_arabic(text: str) -> str:
    """Normalize for embedding — do NOT use on display text.

    Removes diacritics and tatweel, folds the letter variants listed in
    ``_AR_NORMALIZE_TABLE`` and collapses every whitespace run (including
    newlines) into a single space.

    Args:
        text: Arabic (or mixed) text.  Falsy input yields ''.

    Returns:
        The normalized string, stripped of leading/trailing whitespace.
    """
    if not text:
        return ''
    # One C-level pass instead of five sequential regex substitutions; the
    # result is the same because no replacement character is itself a key.
    text = text.translate(_AR_NORMALIZE_TABLE)
    return re.sub(r'\s+', ' ', text).strip()
# Arabic club name as it appears in article text → short ASCII team id.
# detect_teams() looks these keys up by substring and reports the ids in the
# order they are listed here.
# NOTE(review): 'البنك الأهلي' contains 'الأهلي', and 'المصري' also occurs as an
# ordinary adjective (e.g. inside 'الدوري المصري'), so plain substring hits on
# these keys can be false positives — confirm how callers want that handled.
EGYPTIAN_TEAMS = {
'الأهلي': 'al_ahly',
'الزمالك': 'zamalek',
'بيراميدز': 'pyramids',
'الإسماعيلي': 'ismaily',
'المصري': 'masry',
'سيراميكا': 'ceramica',
'طلائع الجيش': 'tala3a',
'فاركو': 'farco',
'حرس الحدود': 'haras',
'إنبي': 'enppi',
'المقاولون': 'mokawloon',
'مودرن': 'modern',
'البنك الأهلي': 'nbe',
'غزل المحلة': 'ghazl',
'سموحة': 'smouha',
'الجونة': 'el_gouna',
}
# League id → Arabic phrases that identify the competition.
# detect_league() tests these phrases by substring and iterates the dict in
# insertion order, so earlier entries win when several leagues match.
# The second 'premier_league' phrase contains the first one, so it can never
# match on its own.
LEAGUE_KEYWORDS = {
'premier_league': ['الدوري الإنجليزي', 'الدوري الإنجليزي الممتاز'],
'la_liga': ['الدوري الإسباني', 'لاليغا'],
'serie_a': ['الدوري الإيطالي'],
'bundesliga': ['الدوري الألماني'],
'ligue_1': ['الدوري الفرنسي'],
'champions_league': ['دوري أبطال أوروبا'],
'caf_champions': ['دوري أبطال إفريقيا', 'الكونفدرالية'],
'egyptian_league': ['الدوري المصري', 'دوري المحترفين'],
'saudi_league': ['الدوري السعودي', 'روشن'],
}
def detect_teams(text: str, teams: 'dict | None' = None) -> list:
    """Return the ids of the clubs mentioned in *text*.

    Names are matched by substring, longest name first, and every matched
    name is blanked out before shorter names are tried.  This keeps a name
    that contains another one (``'البنك الأهلي'`` contains ``'الأهلي'``) from
    also reporting the shorter club, while a separate standalone mention of
    the shorter name is still detected.

    Args:
        text: Text to scan (title and/or body).  Falsy input yields [].
        teams: Optional ``{name: team_id}`` table; defaults to the module's
            ``EGYPTIAN_TEAMS``.

    Returns:
        Unique team ids, ordered as they appear in the table.
    """
    table = EGYPTIAN_TEAMS if teams is None else teams
    if not text:
        return []

    remaining = text
    found = set()
    for name in sorted(table, key=len, reverse=True):
        if name in remaining:
            found.add(table[name])
            # Replace with a space (not '') so the neighbours of the removed
            # name cannot fuse into a new, accidental match.
            remaining = remaining.replace(name, ' ')

    # Report in table order, independent of the longest-first scan order.
    return [team_id for team_id in dict.fromkeys(table.values()) if team_id in found]
# Fallback used by detect_league() when no league phrase matched:
# site section name (substring of the article's section) → league id.
_SECTION_LEAGUES = {
    'الكرة المصرية': 'egyptian_league',
    'الدوري المصري': 'egyptian_league',
    'الكرة الإفريقية': 'caf_champions',
    'سعودي في الجول': 'saudi_league',
    'الدوري الإنجليزي': 'premier_league',
    'الكرة الأوروبية': 'champions_league',
}


def detect_league(title: str, section: str, body: str,
                  keywords: 'dict | None' = None) -> str:
    """Guess which competition an article is about.

    League phrases are searched in the title first, then the section, then
    the first 300 characters of the body, so a league named in the title is
    never overridden by a different league that is only mentioned in the
    body.  Within one source the table's insertion order decides.  When no
    phrase matches, the section name is looked up in ``_SECTION_LEAGUES``.

    Args:
        title: Article title.
        section: Site section the article was filed under.
        body: Cleaned article body; only its first 300 characters are used.
        keywords: Optional ``{league_id: [phrase, ...]}`` table; defaults to
            the module's ``LEAGUE_KEYWORDS``.

    Returns:
        A league id, or ``'other'`` when nothing matched.  ``None`` inputs
        are treated as empty strings.
    """
    table = LEAGUE_KEYWORDS if keywords is None else keywords
    title = title or ''
    section = section or ''
    body = body or ''

    # Strongest evidence first: title, then section, then the body lead.
    for source in (title, section, body[:300]):
        if not source:
            continue
        for league_id, phrases in table.items():
            if any(phrase in source for phrase in phrases):
                return league_id

    for section_name, league_id in _SECTION_LEAGUES.items():
        if section_name in section:
            return league_id
    return 'other'
def chunk_text(text: str, chunk_size: 'int | None' = None,
               overlap: 'int | None' = None) -> list:
    """Split *text* into overlapping windows of whitespace-separated words.

    Args:
        text: Text to split.
        chunk_size: Maximum words per chunk; defaults to ``CHUNK_SIZE``.
        overlap: Words shared between consecutive chunks; defaults to
            ``CHUNK_OVERLAP``.

    Returns:
        A list of chunk strings.  Text that fits in one window is returned
        unchanged as the only element (so ``''`` gives ``['']``); otherwise
        each chunk is its words re-joined with single spaces.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            in ``[0, chunk_size)`` — the window would never advance.
    """
    size = CHUNK_SIZE if chunk_size is None else chunk_size
    shared = CHUNK_OVERLAP if overlap is None else overlap
    if size <= 0 or not 0 <= shared < size:
        raise ValueError(
            f"chunk_size must be > 0 and 0 <= overlap < chunk_size "
            f"(got chunk_size={size}, overlap={shared})"
        )

    words = text.split()
    if len(words) <= size:
        return [text]

    chunks = []
    for start in range(0, len(words), size - shared):
        chunks.append(' '.join(words[start:start + size]))
        # Stop as soon as a window reaches the end, so no trailing chunk that
        # consists only of already-emitted overlap words is produced.
        if start + size >= len(words):
            break
    return chunks
# Article-type id → Arabic label.  build_chunk_text() uses it for the type
# field of the chunk header and falls back to the raw id for unknown types.
TYPE_AR = {
'lineup': 'تشكيلة',
'match_result': 'نتيجة مباراة',
'press_conference': 'مؤتمر صحفي',
'training': 'تدريب',
'transfer': 'ميركاتو',
'article': 'خبر',
}
def build_chunk_text(art: dict, chunk_body: str, idx: int, total: int) -> str:
    """Compose the text stored for one chunk: a metadata header plus the body.

    The header is a ' | '-separated line that always starts with the title
    (normalized title when available) and then lists, when present: article
    type (omitted for plain 'article'), section, league (omitted for
    'other'), up to three team ids and the first 10 characters of the
    publication date.  Multi-chunk articles also get a part marker line in
    front of the body.

    Args:
        art: Enriched article dict.
        chunk_body: Body text of this chunk.
        idx: Zero-based chunk index.
        total: Number of chunks the article was split into.

    Returns:
        ``"<header>\\n\\n[<part marker>\\n]<chunk_body>"``.
    """
    kind = art.get('article_type', 'article')
    section = art.get('section')
    league = art.get('league', 'other')
    teams = art.get('teams')
    pub_date = art.get('pub_date')

    # Each optional field is either its formatted text or None when absent.
    optional_fields = (
        f"نوع: {TYPE_AR.get(kind, kind)}" if kind != 'article' else None,
        f"قسم: {section}" if section else None,
        f"بطولة: {league}" if league != 'other' else None,
        f"الفرق: {' - '.join(teams[:3])}" if teams else None,
        f"تاريخ: {pub_date[:10]}" if pub_date else None,
    )
    header = ' | '.join(
        [f"عنوان: {art.get('title_norm') or art.get('title', '')}"]
        + [field for field in optional_fields if field is not None]
    )

    # Only multi-part articles carry the "[part i/n]" marker.
    part_marker = f"[جزء {idx+1}/{total}]\n" if total > 1 else ""
    return f"{header}\n\n{part_marker}{chunk_body}"
def _load_unique_articles(raw_file: Path) -> list:
    """Read a JSONL file and return its article dicts, deduplicated by id.

    Blank lines are ignored.  Lines that are not valid JSON objects are
    counted and reported in one warning instead of being dropped silently.
    Rows without a truthy ``article_id`` and repeated ids are skipped; the
    first occurrence of each id wins.
    """
    articles, seen_ids = [], set()
    malformed = 0
    with open(raw_file, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                art = json.loads(line)
            except json.JSONDecodeError:
                malformed += 1
                continue
            if not isinstance(art, dict):
                # Valid JSON but not an object (e.g. a bare list or number).
                malformed += 1
                continue
            aid = art.get('article_id')
            if aid and aid not in seen_ids:
                seen_ids.add(aid)
                articles.append(art)
    if malformed:
        log.warning(f"Ignored {malformed} malformed line(s) in {raw_file}")
    return articles


def run_pipeline(raw_file: Path = RAW_FILE):
    """Clean, enrich and chunk every article, then write chunks and stats.

    Reads *raw_file* (JSON lines), skips articles from non-football sections
    and articles whose cleaned body is shorter than 80 characters, and writes
    one JSON record per chunk to ``CHUNKS_FILE``.  Run statistics are written
    to ``STATS_FILE``.

    Args:
        raw_file: Path to the raw ``articles.jsonl`` file.

    Returns:
        The statistics dict that was written to ``STATS_FILE``, or ``None``
        when *raw_file* does not exist (an error is logged in that case).
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    if not Path(raw_file).exists():
        log.error(f"Not found: {raw_file}")
        return None

    articles = _load_unique_articles(Path(raw_file))
    log.info(f"Loaded {len(articles)} articles")

    type_c = defaultdict(int)
    league_c = defaultdict(int)
    total_chunks = skipped = skipped_section = 0
    body_lens = []

    with open(CHUNKS_FILE, 'w', encoding='utf-8') as fout:
        for art in articles:
            # ── Filter: skip non-football sections ──────────────────────────
            # `or ''` guards against an explicit JSON null in the field.
            section = (art.get('section') or '').strip()
            if section in NON_FOOTBALL_SECTIONS:
                skipped_section += 1
                log.debug(f"Skipped non-football section '{section}': {art.get('article_id')}")
                continue

            title = art.get('title') or ''
            bc = clean_filgoal_body(art.get('body', ''), title)
            if len(bc) < 80:
                skipped += 1
                continue
            body_lens.append(len(bc))

            # Normalized text is what gets chunked/embedded; the cleaned,
            # un-normalized body is kept alongside for display.
            tn = normalize_arabic(title)
            bn = normalize_arabic(bc)
            teams = detect_teams(title + ' ' + bc)
            league = detect_league(title, section, bc)
            article_type = art.get('article_type', 'article')
            enriched = {
                **art,
                'body_clean': bc,
                'title_norm': tn,
                'teams': teams,
                'league': league,
            }

            chunks = chunk_text(bn)
            n = len(chunks)
            for i, cb in enumerate(chunks):
                fout.write(json.dumps({
                    'chunk_id': f"{art['article_id']}_{i}",
                    'article_id': art['article_id'],
                    'chunk_index': i,
                    'total_chunks': n,
                    'text': build_chunk_text(enriched, cb, i, n),
                    'title': title,
                    'title_norm': tn,
                    'body_clean': bc,
                    'section': section,
                    'article_type': article_type,
                    'pub_date': art.get('pub_date', ''),
                    'teams': teams,
                    'league': league,
                    'tags': art.get('tags', []),
                    'source_url': art.get('source_url', ''),
                    'image': art.get('image', ''),
                }, ensure_ascii=False) + '\n')
            total_chunks += n

            # Counted once per article, not once per chunk, so the totals
            # line up with 'articles_processed'.
            type_c[article_type] += 1
            league_c[league] += 1

    processed = len(articles) - skipped - skipped_section
    avg = int(sum(body_lens) / len(body_lens)) if body_lens else 0
    stats = {
        'total_articles': len(articles),
        'articles_processed': processed,
        'skipped_short_body': skipped,
        'skipped_non_football': skipped_section,
        'total_chunks': total_chunks,
        'avg_body_length': avg,
        'min_body_length': min(body_lens) if body_lens else 0,
        'max_body_length': max(body_lens) if body_lens else 0,
        'article_types': dict(type_c),
        'league_coverage': dict(league_c),
        'processed_at': datetime.now().isoformat(),
    }
    # ensure_ascii=False can emit non-ASCII text, so the encoding must be
    # explicit rather than the platform default.
    STATS_FILE.write_text(
        json.dumps(stats, ensure_ascii=False, indent=2), encoding='utf-8',
    )

    log.info("\n Preprocessing complete!")
    log.info(f" Articles total : {len(articles)}")
    log.info(f" Processed : {processed}")
    log.info(f" Skipped (short body): {skipped}")
    log.info(f" Skipped (non-football sections): {skipped_section} "
             f"{sorted(NON_FOOTBALL_SECTIONS)}")
    log.info(f" Chunks : {total_chunks}")
    log.info(f" Avg body : {avg} chars")
    log.info(f" Types : {dict(type_c)}")
    log.info(f" Leagues : {dict(league_c)}")
    log.info(f" Output : {CHUNKS_FILE}")
    return stats
if __name__ == '__main__':
    # Command-line entry point: the only option is the location of the raw
    # JSONL dump, which defaults to RAW_FILE.
    cli = argparse.ArgumentParser(description='FilGoalBot Preprocessing Pipeline v4')
    cli.add_argument('--input', default=str(RAW_FILE), help='Path to raw articles.jsonl')
    run_pipeline(Path(cli.parse_args().input))