Spaces:

Omar10lfc
/

Fil-RAG-Goal

Sleeping

App Files Files Community

Fil-RAG-Goal / preprocessing /pipeline.py

Omar10lfc

Initial Space deployment

18fd039 16 days ago

raw

history blame contribute delete

28.4 kB

	# """
	# FilGoalBot Preprocessing Pipeline — v3 (Production)
	# =====================================================
	# Input: data/raw/articles.jsonl
	# Output: data/processed/chunks.jsonl
	# data/processed/stats.json

	# All 9 FilGoal-specific noise patterns handled:
	# 1. Social share empty links [](https://twitter/facebook...)
	# 2. Date/byline: 'الأحد، 15 مارس 2026 - 02:08 كتب : FilGoal'
	# 3. Scoreboard widget: 1\\ TeamA\\ 1\\ TeamB\\ League
	# 4. انتهتHH:MM inline score timestamps
	# 5. Backslash sequences \\\\ (Firecrawl markdown artifact)
	# 6. News ticker sidebar: 'N دقيقة \|' or 'ساعة \|' — CUT everything after
	# 7. Related articles separator '__' — CUT everything after
	# 8. Markdown bold/italic/headers/links/images
	# 9. HTML tags, Getty captions, duplicate adjacent phrases
	# """

	# import re, json, logging, argparse
	# from pathlib import Path
	# from datetime import datetime
	# from collections import defaultdict

	# logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
	# log = logging.getLogger("preprocessing")

	# RAW_FILE = Path("data/raw/articles.jsonl")
	# OUTPUT_DIR = Path("data/processed")
	# CHUNKS_FILE = OUTPUT_DIR / "chunks.jsonl"
	# STATS_FILE = OUTPUT_DIR / "stats.json"
	# CHUNK_SIZE, CHUNK_OVERLAP = 300, 60

	# _AR_DAYS = r'(الأحد\|الإثنين\|الثلاثاء\|الأربعاء\|الخميس\|الجمعة\|السبت)'
	# _HARAKAT = re.compile(r'[\u064B-\u065F\u0670\u0640]')


	# def clean_filgoal_body(text: str, title: str = '') -> str:
	# if not text:
	# return ''
	# # 1. Empty social share links
	# text = re.sub(r'\[\]$https?://[^$]+\)\s*', '', text)
	# # 2. Date + byline header
	# text = re.sub(_AR_DAYS + r'[،,]\s\d{1,2}\s+\w+\s+\d{4}\s[-–]\s\d{2}:\d{2}\s', '', text)
	# text = re.sub(r'كتب\s:\sFilGoal\s*', '', text)
	# # 3. Scoreboard widget
	# text = re.sub(
	# r'\\\d+\\\s\\\\\s[^\\n]{2,40}\s\\\\\s*'
	# r'\\\d+\\\s\\\\\s[^\\n]{2,40}\s\\\\\s\\[^\\n]+\\',
	# '', text
	# )
	# # 4. انتهتHH:MM
	# text = re.sub(r'انتهت\d{2}:\d{2}\s*', '', text)
	# # 5. All backslashes → space (must happen BEFORE ticker cut)
	# text = re.sub(r'\\+', ' ', text)
	# # 6. CUT news ticker: 'N دقيقة \|' or 'ساعة \|'
	# ticker = re.search(r'\d+\s+دقيقة\s+\\|\|\bساعة\s+\\|', text)
	# if ticker:
	# text = text[:ticker.start()].strip()
	# # 7. CUT at __ related articles separator
	# dunder = text.find(' __ ')
	# if dunder > 200:
	# text = text[:dunder].strip()
	# # 8. Remove /articles/NNNNN paths
	# text = re.sub(r'/articles/\d+\S*', '', text)
	# # 9. Markdown → plain text
	# text = re.sub(r'\\([^\\n]+)\\*', r'\1', text)
	# text = re.sub(r'\([^\\n]+)\*', r'\1', text)
	# text = re.sub(r'#{1,6}\s+', '', text)
	# text = re.sub(r'\[([^\]]+)\]$[^$]+\)', r'\1', text)
	# text = re.sub(r'\[\]$[^$]+\)', '', text)
	# text = re.sub(r'!\[[^\]]*\]$[^$]+\)', '', text)
	# # 10. HTML tags
	# text = re.sub(r'<[^>]+>', '', text)
	# # 11. Image captions
	# text = re.sub(r'صورة\s:\s\S+', '', text)
	# text = re.sub(r'Getty\s*(Images?)?', '', text, flags=re.IGNORECASE)
	# # 12. Deduplicate repeated adjacent words
	# text = re.sub(r'\b(\S{4,})\s+\1\b', r'\1', text)
	# # 13. Final whitespace
	# text = re.sub(r'[ \t]{2,}', ' ', text).strip()
	# # 14. Remove title duplicated at start
	# if title and len(title) > 10 and text.startswith(title[:20].strip()):
	# text = text[len(title[:20]):].strip()
	# return text


	# def normalize_arabic(text: str) -> str:
	# """Normalize for embedding — do NOT use on display text."""
	# if not text:
	# return ''
	# text = _HARAKAT.sub('', text)
	# for frm, to in [('[أإآٱ]', 'ا'), ('ى', 'ي'), ('ة', 'ه'), ('ؤ', 'و')]:
	# text = re.sub(frm, to, text)
	# return re.sub(r'\s+', ' ', text).strip()


	# EGYPTIAN_TEAMS = {
	# 'الأهلي': 'al_ahly', 'الزمالك': 'zamalek', 'بيراميدز': 'pyramids',
	# 'الإسماعيلي': 'ismaily', 'المصري': 'masry', 'سيراميكا': 'ceramica',
	# 'طلائع الجيش': 'tala3a', 'فاركو': 'farco', 'حرس الحدود': 'haras',
	# 'إنبي': 'enppi', 'المقاولون': 'mokawloon', 'مودرن': 'modern',
	# 'البنك الأهلي': 'nbe', 'غزل المحلة': 'ghazl', 'سموحة': 'smouha',
	# 'الجونة': 'el_gouna',
	# }

	# LEAGUE_KEYWORDS = {
	# 'premier_league': ['الدوري الإنجليزي', 'الدوري الإنجليزي الممتاز'],
	# 'la_liga': ['الدوري الإسباني', 'لاليغا'],
	# 'serie_a': ['الدوري الإيطالي'],
	# 'bundesliga': ['الدوري الألماني'],
	# 'ligue_1': ['الدوري الفرنسي'],
	# 'champions_league': ['دوري أبطال أوروبا'],
	# 'caf_champions': ['دوري أبطال إفريقيا', 'الكونفدرالية'],
	# 'egyptian_league': ['الدوري المصري', 'دوري المحترفين'],
	# 'saudi_league': ['الدوري السعودي', 'روشن'],
	# }

	# def detect_teams(text: str) -> list:
	# seen, result = set(), []
	# for ar, en in EGYPTIAN_TEAMS.items():
	# if ar in text and en not in seen:
	# seen.add(en); result.append(en)
	# return result

	# def detect_league(title: str, section: str, body: str) -> str:
	# combined = f"{title} {section} {body[:300]}"
	# for lid, kws in LEAGUE_KEYWORDS.items():
	# if any(kw in combined for kw in kws):
	# return lid
	# SECT_MAP = {
	# 'الكرة المصرية': 'egyptian_league', 'الدوري المصري': 'egyptian_league',
	# 'الكرة الإفريقية': 'caf_champions', 'سعودي في الجول': 'saudi_league',
	# 'الدوري الإنجليزي': 'premier_league','الكرة الأوروبية': 'champions_league',
	# }
	# for k, v in SECT_MAP.items():
	# if k in section:
	# return v
	# return 'other'


	# def chunk_text(text: str) -> list:
	# words = text.split()
	# if len(words) <= CHUNK_SIZE:
	# return [text]
	# chunks, start = [], 0
	# while start < len(words):
	# end = min(start + CHUNK_SIZE, len(words))
	# chunks.append(' '.join(words[start:end]))
	# if end == len(words):
	# break
	# start += CHUNK_SIZE - CHUNK_OVERLAP
	# return chunks


	# TYPE_AR = {
	# 'lineup': 'تشكيلة', 'match_result': 'نتيجة مباراة',
	# 'press_conference': 'مؤتمر صحفي', 'training': 'تدريب',
	# 'transfer': 'ميركاتو', 'article': 'خبر',
	# }

	# def build_chunk_text(art: dict, chunk_body: str, idx: int, total: int) -> str:
	# parts = [f"عنوان: {art.get('title_norm') or art.get('title', '')}"]
	# t = art.get('article_type', 'article')
	# if t != 'article':
	# parts.append(f"نوع: {TYPE_AR.get(t, t)}")
	# if art.get('section'):
	# parts.append(f"قسم: {art['section']}")
	# if art.get('league', 'other') != 'other':
	# parts.append(f"بطولة: {art['league']}")
	# if art.get('teams'):
	# parts.append(f"الفرق: {' - '.join(art['teams'][:3])}")
	# if art.get('pub_date'):
	# parts.append(f"تاريخ: {art['pub_date'][:10]}")
	# prefix = ' \| '.join(parts)
	# suffix = f"[جزء {idx+1}/{total}]\n" if total > 1 else ""
	# return f"{prefix}\n\n{suffix}{chunk_body}"


	# def run_pipeline(raw_file: Path = RAW_FILE):
	# OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
	# if not Path(raw_file).exists():
	# log.error(f"Not found: {raw_file}")
	# return

	# articles, seen_ids = [], set()
	# with open(raw_file, encoding='utf-8') as f:
	# for line in f:
	# line = line.strip()
	# if not line:
	# continue
	# try:
	# art = json.loads(line)
	# aid = art.get('article_id')
	# if aid and aid not in seen_ids:
	# seen_ids.add(aid)
	# articles.append(art)
	# except json.JSONDecodeError:
	# pass

	# log.info(f"Loaded {len(articles)} articles")

	# type_c = defaultdict(int)
	# league_c = defaultdict(int)
	# total_chunks = skipped = 0
	# body_lens = []

	# with open(CHUNKS_FILE, 'w', encoding='utf-8') as fout:
	# for art in articles:
	# title = art.get('title', '')
	# bc = clean_filgoal_body(art.get('body', ''), title)
	# if len(bc) < 80:
	# skipped += 1
	# continue
	# body_lens.append(len(bc))
	# tn = normalize_arabic(title)
	# bn = normalize_arabic(bc)
	# teams = detect_teams(title + ' ' + bc)
	# league = detect_league(title, art.get('section', ''), bc)
	# enriched = {**art, 'body_clean': bc, 'title_norm': tn, 'teams': teams, 'league': league}
	# chunks = chunk_text(bn)
	# n = len(chunks)
	# for i, cb in enumerate(chunks):
	# fout.write(json.dumps({
	# 'chunk_id': f"{art['article_id']}_{i}",
	# 'article_id': art['article_id'],
	# 'chunk_index': i,
	# 'total_chunks': n,
	# 'text': build_chunk_text(enriched, cb, i, n),
	# 'title': title,
	# 'title_norm': tn,
	# 'body_clean': bc,
	# 'section': art.get('section', ''),
	# 'article_type': art.get('article_type', 'article'),
	# 'pub_date': art.get('pub_date', ''),
	# 'teams': teams,
	# 'league': league,
	# 'tags': art.get('tags', []),
	# 'source_url': art.get('source_url', ''),
	# 'image': art.get('image', ''),
	# }, ensure_ascii=False) + '\n')
	# total_chunks += 1
	# type_c[art.get('article_type', 'article')] += 1
	# league_c[league] += 1

	# avg = int(sum(body_lens) / len(body_lens)) if body_lens else 0
	# stats = {
	# 'total_articles': len(articles),
	# 'articles_processed': len(articles) - skipped,
	# 'skipped': skipped,
	# 'total_chunks': total_chunks,
	# 'avg_body_length': avg,
	# 'min_body_length': min(body_lens) if body_lens else 0,
	# 'max_body_length': max(body_lens) if body_lens else 0,
	# 'article_types': dict(type_c),
	# 'league_coverage': dict(league_c),
	# 'processed_at': datetime.now().isoformat(),
	# }
	# STATS_FILE.write_text(json.dumps(stats, ensure_ascii=False, indent=2))

	# log.info(f"\n Preprocessing complete!")
	# log.info(f" Articles : {len(articles) - skipped} / {len(articles)}")
	# log.info(f" Chunks : {total_chunks}")
	# log.info(f" Avg body : {avg} chars")
	# log.info(f" Types : {dict(type_c)}")
	# log.info(f" Leagues : {dict(league_c)}")
	# log.info(f" Output : {CHUNKS_FILE}")
	# return stats


	# if __name__ == '__main__':
	# parser = argparse.ArgumentParser(description='FilGoalBot Preprocessing Pipeline')
	# parser.add_argument('--input', default=str(RAW_FILE), help='Path to raw articles.jsonl')
	# args = parser.parse_args()
	# run_pipeline(Path(args.input))


	"""
	FilGoalBot Preprocessing Pipeline — v4 (Production)
	=====================================================
	Input: data/raw/articles.jsonl
	Output: data/processed/chunks.jsonl
	data/processed/stats.json

	All 9 FilGoal-specific noise patterns handled:
	1. Social share empty links [](https://twitter/facebook...)
	2. Date/byline: 'الأحد، 15 مارس 2026 - 02:08 كتب : FilGoal'
	3. Scoreboard widget: 1\\ TeamA\\ 1\\ TeamB\\ League
	4. انتهتHH:MM inline score timestamps
	5. Backslash sequences \\\\ (Firecrawl markdown artifact)
	6. News ticker sidebar: 'N دقيقة |' or 'ساعة |' — CUT everything after
	7. Related articles separator '__' — CUT everything after
	8. Markdown bold/italic/headers/links/images
	9. HTML tags, Getty captions, duplicate adjacent phrases

	v4 changes:
	- Skip non-football sections: كرة يد / كرة سلة / كرة طائرة
	- video:1 placeholders removed
	- Embedded tweet pic links pic.twitter.com stripped
	- Hashtags stripped (including escaped \\_)
	- Angle-bracket tweet separator > replaced with space
	- FilGoal domain refs filgoal.com/... stripped
	- Client JS artifact at tail stripped
	- HaytersTV promo paragraph stripped
	- English tweet dates (March 14, 2026) stripped
	- YouTube label + iframe block stripped
	- beIN Sports handles @beINSPORTS stripped
	- Mixed bold-italic _**text**_ / **_text_** markers stripped
	- Emojis removed
	- Tashkeel/tatweel diacritics removed (normalization)
	- Alef variants أ إ آ → ا (normalization)
	"""

	import re, json, logging, argparse
	from pathlib import Path
	from datetime import datetime
	from collections import defaultdict

	# Console logging for the whole run: timestamp, level, message.
	logging.basicConfig(
	    format="%(asctime)s [%(levelname)s] %(message)s",
	    level=logging.INFO,
	)
	log = logging.getLogger("preprocessing")

	# Input and output locations, relative to the current working directory.
	RAW_FILE = Path("data/raw/articles.jsonl")
	OUTPUT_DIR = Path("data/processed")
	CHUNKS_FILE = OUTPUT_DIR.joinpath("chunks.jsonl")
	STATS_FILE = OUTPUT_DIR.joinpath("stats.json")

	# Chunk window and overlap, counted in whitespace-separated words.
	CHUNK_SIZE = 300
	CHUNK_OVERLAP = 60

	# Sections whose articles are dropped before any cleaning — not football.
	NON_FOOTBALL_SECTIONS = {
	    'كرة يد',
	    'كرة سلة',
	    'كرة طائرة',
	    'رياضات أخرى',
	}

	# Arabic weekday names; anchors the date/byline header pattern.
	# The alternation needs bare "|" — an escaped "\|" matches a literal pipe.
	_AR_DAYS = r'(الأحد|الإثنين|الثلاثاء|الأربعاء|الخميس|الجمعة|السبت)'
	# Arabic diacritics U+064B–U+065F, superscript alef U+0670, tatweel U+0640.
	_HARAKAT = re.compile(r'[\u064B-\u065F\u0670\u0640]')
	# Pictographic / symbol ranges treated as emoji and removed from bodies.
	_EMOJI = re.compile(
	    r'[\U0001F300-\U0001F9FF'
	    r'\U00002600-\U000027FF'
	    r'\U0000FE00-\U0000FE0F'
	    r'\U0001FA00-\U0001FA9F]+',
	    flags=re.UNICODE,
	)

	# English month names; used to strip dates copied from embedded tweets.
	_ENGLISH_MONTHS = (
	    r'(January|February|March|April|May|June|'
	    r'July|August|September|October|November|December)'
	)


	def clean_filgoal_body(text: str, title: str = '') -> str:
	    """Strip FilGoal / Firecrawl page noise from a scraped article body.

	    The cleaning runs in ordered phases; the order matters:

	    * Phase A removes patterns that rely on the raw markdown still being
	      intact (empty share links, images/links, byline, scoreboard widget,
	      hashtags with escaped underscores, FilGoal URLs).
	    * Phase B turns every backslash run into a space and then truncates
	      the body at the news-ticker sidebar and at the ' __ ' separator.
	    * Phases C-E remove promo/attribution text, remaining markdown
	      emphasis, HTML tags and image captions.
	    * Phase F collapses duplicates/whitespace and drops a leading copy of
	      the title.

	    Args:
	        text: Raw article body (markdown). Falsy input yields ''.
	        title: Article title; when longer than 10 characters and repeated
	            at the very start of the cleaned body, it is removed.

	    Returns:
	        The cleaned body as plain text ('' when nothing is left).
	    """
	    if not text:
	        return ''

	    # ── Phase A: patterns that need the raw markdown ─────────────────────

	    # 1. Empty social share links: [](https://...)
	    text = re.sub(r'\[\]\(https?://[^\)]+\)\s*', '', text)

	    # 2. Markdown images and links. Images go first, otherwise the link
	    #    rule turns "![alt](url)" into "!alt". Both run before any URL
	    #    stripping below, because those patterns end in \S* and would
	    #    swallow the closing ")" of "[text](https://filgoal.com/...)".
	    text = re.sub(r'!\[[^\]]*\]\([^\)]+\)', '', text)
	    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
	    text = re.sub(r'\[\]\([^\)]+\)', '', text)

	    # 3. Date + byline header, e.g. 'الأحد، 15 مارس 2026 - 02:08 كتب : FilGoal'
	    text = re.sub(
	        _AR_DAYS + r'[،,]\s*\d{1,2}\s+\w+\s+\d{4}\s*[-–]\s*\d{2}:\d{2}\s*',
	        '', text,
	    )
	    text = re.sub(r'كتب\s*:\s*FilGoal\s*', '', text)

	    # 4. Scoreboard widget: bold score, team, bold score, team, bold league,
	    #    each separated by a double backslash.
	    #    NOTE(review): layout inferred from the pattern itself — confirm
	    #    against raw samples.
	    text = re.sub(
	        r'\*\*\d+\*\*\s*\\\\\s*[^\*\n]{2,40}\s*\\\\\s*'
	        r'\*\*\d+\*\*\s*\\\\\s*[^\*\n]{2,40}\s*\\\\\s*\*\*[^\*\n]+\*\*',
	        '', text,
	    )

	    # 5. "انتهتHH:MM" inline score timestamps
	    text = re.sub(r'انتهت\d{2}:\d{2}\s*', '', text)

	    # 6. video:N placeholders
	    text = re.sub(r'\bvideo:\d+\b', '', text)

	    # 7. Embedded tweet picture links, with or without scheme
	    text = re.sub(r'(?:https?://)?pic\.twitter\.com/\S+', '', text)

	    # 8. Hashtags, including the escaped-underscore form (#a\_b)
	    text = re.sub(r'#[\w\u0600-\u06FF]+(?:\\_[\w\u0600-\u06FF]+)*', '', text)

	    # 9. Leading ">" quote markers (tweet separator lines)
	    text = re.sub(r'(?m)^>\s*', ' ', text)

	    # 10. FilGoal domain references — full URL, scheme-less URL, bare domain
	    text = re.sub(r'https?://(?:www\.)?filgoal\.com/\S*', '', text)
	    text = re.sub(r'(?:www\.)?filgoal\.com/\S*', '', text)
	    text = re.sub(r'\bfilgoal\.com\b', '', text)

	    # ── Phase B: hard cuts ───────────────────────────────────────────────

	    # 11. Backslash runs → space. Must precede the ticker cut so an escaped
	    #     pipe ("\|") is seen as whitespace followed by "|".
	    text = re.sub(r'\\+', ' ', text)

	    # 12. Cut at the news ticker: 'N دقيقة |' or 'ساعة |'
	    ticker = re.search(r'\d+\s+دقيقة\s+\||\bساعة\s+\|', text)
	    if ticker:
	        text = text[:ticker.start()].strip()

	    # 13. Cut at the ' __ ' related-articles separator, but only when it
	    #     appears after the first 200 characters.
	    dunder = text.find(' __ ')
	    if dunder > 200:
	        text = text[:dunder].strip()

	    # ── Phase C: post-cut cleanup ────────────────────────────────────────

	    # 14. /articles/NNNNN paths
	    text = re.sub(r'/articles/\d+\S*', '', text)

	    # 15. "Client" JS artifact: drops everything from the first standalone
	    #     word "Client" to the end of the text.
	    text = re.sub(r'\bClient\b.*$', '', text, flags=re.DOTALL)

	    # 16. HaytersTV promo paragraph (through the end of its last line)
	    text = re.sub(
	        r'هايترز\s*تي\s*في.*?(?:يوتيوب|YouTube|اشترك)[^\n]*',
	        '', text, flags=re.IGNORECASE | re.DOTALL,
	    )

	    # 17. YouTube label line and subscriber-count line
	    text = re.sub(r'\bYouTube\b[^\n]*', '', text, flags=re.IGNORECASE)
	    text = re.sub(
	        r'\d[\d,\.]*\s*(?:مشترك|subscriber)[^\n]*',
	        '', text, flags=re.IGNORECASE,
	    )

	    # 18. beIN Sports handles and attributions
	    text = re.sub(r'@beIN\w*', '', text, flags=re.IGNORECASE)
	    text = re.sub(
	        r'beIN\s*Sports?\s*(?:عربي|العربية|Arabic)?',
	        '', text, flags=re.IGNORECASE,
	    )

	    # 19. English tweet dates: "14 March 2026" and "March 14, 2026"
	    text = re.sub(r'\b\d{1,2}\s+' + _ENGLISH_MONTHS + r'\s+\d{4}\b', '', text)
	    text = re.sub(_ENGLISH_MONTHS + r'\s+\d{1,2},?\s+\d{4}\b', '', text)

	    # 20. Mixed bold-italic: _**text**_ and **_text_**
	    text = re.sub(r'_\*\*([^\*\n]+)\*\*_', r'\1', text)
	    text = re.sub(r'\*\*_([^_\n]+)_\*\*', r'\1', text)

	    # 21. Emojis
	    text = _EMOJI.sub('', text)

	    # ── Phase D: remaining markdown → plain text ─────────────────────────

	    text = re.sub(r'\*\*([^\*\n]+)\*\*', r'\1', text)   # **bold**
	    text = re.sub(r'\*([^\*\n]+)\*', r'\1', text)       # *italic*
	    text = re.sub(r'#{1,6}\s+', '', text)               # header markers

	    # ── Phase E: HTML + captions ─────────────────────────────────────────

	    text = re.sub(r'<[^>]+>', '', text)
	    text = re.sub(r'صورة\s*:\s*\S+', '', text)
	    text = re.sub(r'Getty\s*(Images?)?', '', text, flags=re.IGNORECASE)

	    # ── Phase F: final normalisation ─────────────────────────────────────

	    # Collapse a word of 4+ characters that is immediately repeated.
	    text = re.sub(r'\b(\S{4,})\s+\1\b', r'\1', text)
	    # Collapse runs of spaces/tabs.
	    text = re.sub(r'[ \t]{2,}', ' ', text).strip()
	    # Drop the title when it is duplicated at the start of the body: the
	    # whole title if present, otherwise just its first 20 characters.
	    if title and len(title) > 10:
	        full_title = title.strip()
	        title_head = title[:20].strip()
	        if text.startswith(full_title):
	            text = text[len(full_title):].strip()
	        elif text.startswith(title_head):
	            text = text[len(title_head):].strip()

	    return text


	def normalize_arabic(text: str) -> str:
	    """Fold Arabic spelling variants for embedding; not for display text.

	    Removes the characters matched by ``_HARAKAT``, maps the alef variants
	    (أ إ آ ٱ) to bare alef, alef maqsura to ya, ta marbuta to ha and
	    waw-hamza to waw, then collapses all whitespace runs to one space.

	    Returns '' for falsy input.
	    """
	    if not text:
	        return ''

	    # One-character folds; none of the targets is itself a source, so a
	    # single translate pass equals applying the substitutions in sequence.
	    folds = str.maketrans({
	        'أ': 'ا',
	        'إ': 'ا',
	        'آ': 'ا',
	        'ٱ': 'ا',
	        'ى': 'ي',
	        'ة': 'ه',
	        'ؤ': 'و',
	    })
	    stripped = _HARAKAT.sub('', text)
	    folded = stripped.translate(folds)
	    collapsed = re.sub(r'\s+', ' ', folded)
	    return collapsed.strip()


	# Arabic club name → slug. Insertion order is the order in which
	# detect_teams reports matches.
	EGYPTIAN_TEAMS = {
	    'الأهلي': 'al_ahly', 'الزمالك': 'zamalek',
	    'بيراميدز': 'pyramids', 'الإسماعيلي': 'ismaily',
	    'المصري': 'masry', 'سيراميكا': 'ceramica',
	    'طلائع الجيش': 'tala3a', 'فاركو': 'farco',
	    'حرس الحدود': 'haras', 'إنبي': 'enppi',
	    'المقاولون': 'mokawloon', 'مودرن': 'modern',
	    'البنك الأهلي': 'nbe', 'غزل المحلة': 'ghazl',
	    'سموحة': 'smouha', 'الجونة': 'el_gouna',
	}

	# League id → Arabic keywords. Insertion order is the priority order used
	# by detect_league: the first league with any keyword present wins.
	LEAGUE_KEYWORDS = {
	    'premier_league': [
	        'الدوري الإنجليزي',
	        'الدوري الإنجليزي الممتاز',
	    ],
	    'la_liga': [
	        'الدوري الإسباني',
	        'لاليغا',
	    ],
	    'serie_a': ['الدوري الإيطالي'],
	    'bundesliga': ['الدوري الألماني'],
	    'ligue_1': ['الدوري الفرنسي'],
	    'champions_league': ['دوري أبطال أوروبا'],
	    'caf_champions': [
	        'دوري أبطال إفريقيا',
	        'الكونفدرالية',
	    ],
	    'egyptian_league': [
	        'الدوري المصري',
	        'دوري المحترفين',
	    ],
	    'saudi_league': [
	        'الدوري السعودي',
	        'روشن',
	    ],
	}


	def detect_teams(text: str, teams: "dict | None" = None) -> list:
	    """Return the slugs of the clubs mentioned in ``text``.

	    Matching is plain substring search, but names are tried longest-first
	    and each matched name is blanked out before shorter names are tested.
	    This stops a mention of 'البنك الأهلي' from also being reported as
	    'الأهلي'; a separate mention of the shorter name is still detected.

	    Args:
	        text: Text to scan (un-normalized, since the table keys keep
	            their hamza forms). ``None`` or '' yields an empty list.
	        teams: Optional ``{arabic_name: slug}`` table; defaults to the
	            module-level ``EGYPTIAN_TEAMS``.

	    Returns:
	        Unique slugs, ordered as they appear in the table.
	    """
	    if not text:
	        return []
	    table = EGYPTIAN_TEAMS if teams is None else teams

	    remaining = text
	    matched = set()
	    # sorted() is stable, so equal-length names keep their table order.
	    for name in sorted(table, key=len, reverse=True):
	        if name in remaining:
	            matched.add(table[name])
	            # Replace with a space so neighbouring words do not fuse.
	            remaining = remaining.replace(name, ' ')

	    result = []
	    for slug in table.values():
	        if slug in matched and slug not in result:
	            result.append(slug)
	    return result


	def detect_league(title: str, section: str, body: str) -> str:
	    """Classify an article into a league id, or 'other'.

	    First looks for any ``LEAGUE_KEYWORDS`` keyword in the title, the
	    section and the first 300 characters of the body (table order decides
	    ties). If none is found, falls back to a section-name mapping.
	    """
	    haystack = f"{title} {section} {body[:300]}"

	    keyword_hit = next(
	        (
	            league_id
	            for league_id, keywords in LEAGUE_KEYWORDS.items()
	            if any(keyword in haystack for keyword in keywords)
	        ),
	        None,
	    )
	    if keyword_hit is not None:
	        return keyword_hit

	    # Fallback: infer the league from a fragment of the section name.
	    section_fallback = {
	        'الكرة المصرية': 'egyptian_league',
	        'الدوري المصري': 'egyptian_league',
	        'الكرة الإفريقية': 'caf_champions',
	        'سعودي في الجول': 'saudi_league',
	        'الدوري الإنجليزي': 'premier_league',
	        'الكرة الأوروبية': 'champions_league',
	    }
	    for fragment, league_id in section_fallback.items():
	        if fragment in section:
	            return league_id
	    return 'other'


	def chunk_text(text: str, size: "int | None" = None,
	               overlap: "int | None" = None) -> list:
	    """Split ``text`` into overlapping word windows.

	    Args:
	        text: Text to split on whitespace.
	        size: Maximum words per chunk; defaults to ``CHUNK_SIZE``.
	        overlap: Words shared between consecutive chunks; defaults to
	            ``CHUNK_OVERLAP``.

	    Returns:
	        ``[text]`` unchanged when it has at most ``size`` words; otherwise
	        a list of space-joined windows, the last one ending at the final
	        word.

	    Raises:
	        ValueError: if ``size`` is not positive or ``overlap`` is not in
	            ``[0, size)`` — such values would never advance the window.
	    """
	    size = CHUNK_SIZE if size is None else size
	    overlap = CHUNK_OVERLAP if overlap is None else overlap
	    if size <= 0 or not 0 <= overlap < size:
	        raise ValueError(
	            f"need size > 0 and 0 <= overlap < size, got size={size}, overlap={overlap}"
	        )

	    words = text.split()
	    if len(words) <= size:
	        return [text]

	    step = size - overlap
	    chunks = []
	    for start in range(0, len(words), step):
	        chunks.append(' '.join(words[start:start + size]))
	        # Stop once a window reaches the last word; a further window
	        # would only repeat the overlap.
	        if start + size >= len(words):
	            break
	    return chunks


	# Article-type id → Arabic label used in the chunk header.
	TYPE_AR = dict(
	    lineup='تشكيلة',
	    match_result='نتيجة مباراة',
	    press_conference='مؤتمر صحفي',
	    training='تدريب',
	    transfer='ميركاتو',
	    article='خبر',
	)


	def build_chunk_text(art: dict, chunk_body: str, idx: int, total: int) -> str:
	    """Build the text stored for one chunk: a metadata header plus body.

	    The header is a single line of ' | '-separated Arabic-labelled fields:
	    title (normalized when available), then — each only when present —
	    article type (omitted for plain 'article'), section, league (omitted
	    for 'other'), up to three teams and the first 10 characters of the
	    publication date. Multi-chunk articles get a '[جزء i/n]' line before
	    the body.

	    Args:
	        art: Article dict; reads 'title_norm', 'title', 'article_type',
	            'section', 'league', 'teams' and 'pub_date'.
	        chunk_body: Text of this chunk.
	        idx: Zero-based chunk index.
	        total: Number of chunks for the article.

	    Returns:
	        ``"<header>\\n\\n[<part marker>\\n]<chunk_body>"``.
	    """
	    parts = [f"عنوان: {art.get('title_norm') or art.get('title', '')}"]

	    article_type = art.get('article_type', 'article')
	    if article_type != 'article':
	        parts.append(f"نوع: {TYPE_AR.get(article_type, article_type)}")
	    if art.get('section'):
	        parts.append(f"قسم: {art['section']}")
	    if art.get('league', 'other') != 'other':
	        parts.append(f"بطولة: {art['league']}")
	    if art.get('teams'):
	        parts.append(f"الفرق: {' - '.join(art['teams'][:3])}")
	    if art.get('pub_date'):
	        # str() keeps the slice safe if the date is not already a string.
	        parts.append(f"تاريخ: {str(art['pub_date'])[:10]}")

	    # Plain pipe separator; '\|' is an invalid escape and would put a
	    # literal backslash into every header.
	    prefix = ' | '.join(parts)
	    part_marker = f"[جزء {idx + 1}/{total}]\n" if total > 1 else ""
	    return f"{prefix}\n\n{part_marker}{chunk_body}"


	def run_pipeline(raw_file: Path = RAW_FILE):
	    """Run the full preprocessing pass and write chunks + stats.

	    Reads one JSON object per line from ``raw_file``, keeps the first
	    occurrence of each truthy ``article_id``, skips non-football sections
	    and articles whose cleaned body is shorter than 80 characters, then
	    writes one JSON line per chunk to ``CHUNKS_FILE`` and a summary to
	    ``STATS_FILE``.

	    Args:
	        raw_file: Path to the raw ``articles.jsonl`` file.

	    Returns:
	        The stats dict that was written, or ``None`` (after logging an
	        error) when ``raw_file`` does not exist.
	    """
	    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
	    raw_file = Path(raw_file)
	    if not raw_file.exists():
	        log.error(f"Not found: {raw_file}")
	        return

	    # ── Load + de-duplicate ──────────────────────────────────────────────
	    articles, seen_ids = [], set()
	    bad_lines = 0
	    with open(raw_file, encoding='utf-8') as f:
	        for lineno, line in enumerate(f, 1):
	            line = line.strip()
	            if not line:
	                continue
	            try:
	                art = json.loads(line)
	            except json.JSONDecodeError as exc:
	                # Keep going, but make the data loss visible.
	                bad_lines += 1
	                log.warning("Skipping malformed JSON on line %d: %s", lineno, exc)
	                continue
	            if not isinstance(art, dict):
	                bad_lines += 1
	                log.warning("Skipping non-object JSON on line %d", lineno)
	                continue
	            aid = art.get('article_id')
	            if aid and aid not in seen_ids:
	                seen_ids.add(aid)
	                articles.append(art)

	    log.info(f"Loaded {len(articles)} articles")

	    type_c = defaultdict(int)
	    league_c = defaultdict(int)
	    total_chunks = skipped = skipped_section = 0
	    body_lens = []

	    # ── Clean, enrich, chunk, write ──────────────────────────────────────
	    with open(CHUNKS_FILE, 'w', encoding='utf-8') as fout:
	        for art in articles:
	            # "or ''" also covers an explicit JSON null, not just a
	            # missing key.
	            section = (art.get('section') or '').strip()
	            if section in NON_FOOTBALL_SECTIONS:
	                skipped_section += 1
	                log.debug(f"Skipped non-football section '{section}': {art.get('article_id')}")
	                continue

	            title = art.get('title') or ''
	            bc = clean_filgoal_body(art.get('body') or '', title)
	            if len(bc) < 80:
	                skipped += 1
	                continue

	            body_lens.append(len(bc))
	            tn = normalize_arabic(title)
	            bn = normalize_arabic(bc)
	            # Team/league detection runs on the un-normalized text.
	            teams = detect_teams(title + ' ' + bc)
	            league = detect_league(title, section, bc)
	            article_type = art.get('article_type', 'article')

	            enriched = {
	                **art,
	                'body_clean': bc,
	                'title_norm': tn,
	                'teams': teams,
	                'league': league,
	            }
	            chunks = chunk_text(bn)
	            n = len(chunks)

	            for i, cb in enumerate(chunks):
	                record = {
	                    'chunk_id': f"{art['article_id']}_{i}",
	                    'article_id': art['article_id'],
	                    'chunk_index': i,
	                    'total_chunks': n,
	                    'text': build_chunk_text(enriched, cb, i, n),
	                    'title': title,
	                    'title_norm': tn,
	                    'body_clean': bc,
	                    'section': section,
	                    'article_type': article_type,
	                    'pub_date': art.get('pub_date', ''),
	                    'teams': teams,
	                    'league': league,
	                    'tags': art.get('tags', []),
	                    'source_url': art.get('source_url', ''),
	                    'image': art.get('image', ''),
	                }
	                fout.write(json.dumps(record, ensure_ascii=False) + '\n')
	                total_chunks += 1

	            type_c[article_type] += 1
	            league_c[league] += 1

	    # ── Stats ────────────────────────────────────────────────────────────
	    processed = len(articles) - skipped - skipped_section
	    avg = int(sum(body_lens) / len(body_lens)) if body_lens else 0

	    stats = {
	        'total_articles': len(articles),
	        'articles_processed': processed,
	        'skipped_short_body': skipped,
	        'skipped_non_football': skipped_section,
	        'malformed_lines': bad_lines,
	        'total_chunks': total_chunks,
	        'avg_body_length': avg,
	        'min_body_length': min(body_lens) if body_lens else 0,
	        'max_body_length': max(body_lens) if body_lens else 0,
	        'article_types': dict(type_c),
	        'league_coverage': dict(league_c),
	        'processed_at': datetime.now().isoformat(),
	    }
	    # ensure_ascii=False can emit non-ASCII text, so the encoding must not
	    # depend on the platform default.
	    STATS_FILE.write_text(
	        json.dumps(stats, ensure_ascii=False, indent=2),
	        encoding='utf-8',
	    )

	    log.info("\n Preprocessing complete!")
	    log.info(f" Articles total : {len(articles)}")
	    log.info(f" Processed : {processed}")
	    log.info(f" Skipped (short body): {skipped}")
	    log.info(f" Skipped (non-football sections): {skipped_section} "
	             f"{sorted(NON_FOOTBALL_SECTIONS)}")
	    log.info(f" Chunks : {total_chunks}")
	    log.info(f" Avg body : {avg} chars")
	    log.info(f" Types : {dict(type_c)}")
	    log.info(f" Leagues : {dict(league_c)}")
	    log.info(f" Output : {CHUNKS_FILE}")
	    return stats


	if __name__ == '__main__':
	    # Script entry point: --input optionally overrides the default raw file.
	    cli = argparse.ArgumentParser(
	        description='FilGoalBot Preprocessing Pipeline v4',
	    )
	    cli.add_argument(
	        '--input',
	        default=str(RAW_FILE),
	        help='Path to raw articles.jsonl',
	    )
	    options = cli.parse_args()
	    run_pipeline(Path(options.input))