# Article ingestion script: fetches configured HTML/PDF sources, extracts and
# chunks their text, and writes normalized JSONL chunks plus a manifest.
#!/usr/bin/env python3
import json
import os
import re
import tempfile
import time
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import requests
from bs4 import BeautifulSoup
from readability import Document
# PDF fallback for arXiv / PDFs
from pdfminer.high_level import extract_text as pdfminer_extract_text
# -----------------------------
# Output
# -----------------------------
# All outputs land under RAG_OUT_DIR (default: data/normalized).
OUT_DIR = os.environ.get("RAG_OUT_DIR", "data/normalized")
OUT_JSONL = os.path.join(OUT_DIR, "chunks_articles.jsonl")      # one JSON record per chunk
OUT_MANIFEST = os.path.join(OUT_DIR, "manifest_articles.json")  # per-document summary
# -----------------------------
# Fetch config
# -----------------------------
# Browser-like headers: some publishers return 403 to the default
# python-requests User-Agent.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/121.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Cache-Control": "no-cache",
    "Pragma": "no-cache",
}
TIMEOUT_S = 30  # per-request timeout, seconds
# -----------------------------
# Sources file (recommended)
# -----------------------------
# Optional JSON config listing sources; when absent (or empty) the
# built-in SOURCES list below is used instead.
SOURCES_FILE = os.environ.get("RAG_ARTICLE_SOURCES", "sources_articles.json")
def load_sources() -> List[Dict]:
    """Load the article source list from SOURCES_FILE, if present.

    Returns an empty list when the config file does not exist, letting
    the caller fall back to the built-in defaults.

    Raises:
        ValueError: if the file exists but does not contain a JSON list.
    """
    config_path = Path(SOURCES_FILE)
    if not config_path.exists():
        return []
    sources = json.loads(config_path.read_text(encoding="utf-8"))
    if not isinstance(sources, list):
        raise ValueError(f"{SOURCES_FILE} must be a JSON list of sources")
    return sources
# Built-in default sources, used when SOURCES_FILE is missing or empty.
# Each entry needs: id (stable slug used in chunk ids), type ("html" or
# "pdf"), publisher (display name), url (fetch location).
SOURCES: List[Dict] = load_sources() or [
    # Anthropic engineering / research posts
    {
        "id": "anthropic_multi_agent_research_system",
        "type": "html",
        "publisher": "Anthropic",
        "url": "https://www.anthropic.com/engineering/multi-agent-research-system",
    },
    {
        "id": "anthropic_agentic_misalignment",
        "type": "html",
        "publisher": "Anthropic",
        "url": "https://www.anthropic.com/research/agentic-misalignment",
    },
    # Foundational arXiv papers (ingested as PDFs via pdfminer)
    {
        "id": "react_arxiv_2210_03629",
        "type": "pdf",
        "publisher": "arXiv",
        "url": "https://arxiv.org/pdf/2210.03629.pdf",
    },
    {
        "id": "rag_arxiv_2005_11401",
        "type": "pdf",
        "publisher": "arXiv",
        "url": "https://arxiv.org/pdf/2005.11401.pdf",
    },
    {
        "id": "toolformer_arxiv_2302_04761",
        "type": "pdf",
        "publisher": "arXiv",
        "url": "https://arxiv.org/pdf/2302.04761.pdf",
    },
    # Towards Data Science (may return 403; skipped gracefully at runtime)
    {
        "id": "tds_single_vs_multi_agent_systems",
        "type": "html",
        "publisher": "Towards Data Science",
        "url": "https://towardsdatascience.com/agentic-ai-single-vs-multi-agent-systems/",
    },
    {
        "id": "tds_langgraph_101_deep_research_agent",
        "type": "html",
        "publisher": "Towards Data Science",
        "url": "https://towardsdatascience.com/langgraph-101-lets-build-a-deep-research-agent/",
    },
    {
        "id": "tds_effective_ai_agents_at_scale",
        "type": "html",
        "publisher": "Towards Data Science",
        "url": "https://towardsdatascience.com/how-to-build-effective-ai-agents-to-process-millions-of-requests/",
    },
    # MCP (Model Context Protocol) resources
    {
        "id": "ai_sdk_mcp_tools",
        "type": "html",
        "publisher": "AI SDK",
        "url": "https://ai-sdk.dev/docs/ai-sdk-core/mcp-tools"
    },
    {
        "id": "byteplus_mcp_topic",
        "type": "html",
        "publisher": "BytePlus",
        "url": "https://www.byteplus.com/en/topic/542256?title="
    },
    {
        "id": "merge_mcp_tool_schema",
        "type": "html",
        "publisher": "Merge.dev",
        "url": "https://www.merge.dev/blog/mcp-tool-schema"
    },
    {
        "id": "netfoundry_ai_agent_mcp_decision",
        "type": "html",
        "publisher": "NetFoundry",
        "url": "https://netfoundry.io/ai/how-an-ai-agent-decides-to-call-mcp-tools/"
    },
    {
        "id": "modelcontextprotocol_github",
        "type": "html",
        "publisher": "Model Context Protocol",
        "url": "https://github.com/modelcontextprotocol/modelcontextprotocol"
    },
    # Agent planning patterns (ReAct vs plan-and-execute)
    {
        "id": "devto_react_vs_plan_execute",
        "type": "html",
        "publisher": "Dev.to",
        "url": "https://dev.to/jamesli/react-vs-plan-and-execute-a-practical-comparison-of-llm-agent-patterns-4gh9"
    },
    {
        "id": "byaiteam_agent_planning_reliability",
        "type": "html",
        "publisher": "By AI Team",
        "url": "https://byaiteam.com/blog/2025/12/09/ai-agent-planning-react-vs-plan-and-execute-for-reliability/"
    },
    {
        "id": "linkedin_build_ai_agent_post",
        "type": "html",
        "publisher": "LinkedIn",
        "url": "https://www.linkedin.com/posts/lewisowain_how-to-build-an-ai-agent-activity-7402339630764941312-_G5h/"
    },
    # Multi-agent papers and search strategies (informed vs uninformed)
    {
        "id": "scitepress_multiagent_paper_2021",
        "type": "pdf",
        "publisher": "SciTePress",
        "url": "https://www.scitepress.org/Papers/2021/105593/105593.pdf"
    },
    {
        "id": "geeksforgeeks_informed_vs_uninformed_search",
        "type": "html",
        "publisher": "GeeksforGeeks",
        "url": "https://www.geeksforgeeks.org/artificial-intelligence/difference-between-informed-and-uninformed-search-in-ai/"
    },
    {
        "id": "baeldung_informed_vs_uninformed_search",
        "type": "html",
        "publisher": "Baeldung",
        "url": "https://www.baeldung.com/cs/informed-vs-uninformed-search"
    },
    {
        "id": "scaler_informed_vs_uninformed_search",
        "type": "html",
        "publisher": "Scaler",
        "url": "https://www.scaler.com/topics/difference-between-informed-and-uninformed-search/"
    },
    {
        "id": "scipub_agent_search_paper_2021",
        "type": "pdf",
        "publisher": "Science Publications",
        "url": "https://thescipub.com/pdf/jcssp.2021.1147.1156.pdf"
    },
    # Orchestration & evaluation of agent systems
    {
        "id": "ibm_ai_agent_orchestration",
        "type": "html",
        "publisher": "IBM",
        "url": "https://www.ibm.com/think/topics/ai-agent-orchestration"
    },
    {
        "id": "domo_ai_agent_orchestration",
        "type": "html",
        "publisher": "Domo",
        "url": "https://www.domo.com/glossary/ai-agent-orchestration"
    },
    {
        "id": "aimultiple_agentic_frameworks",
        "type": "html",
        "publisher": "AI Multiple",
        "url": "https://research.aimultiple.com/agentic-frameworks/"
    },
    {
        "id": "reddit_multiagent_system_evaluator",
        "type": "html",
        "publisher": "Reddit",
        "url": "https://www.reddit.com/r/PromptSynergy/comments/1np7wxw/multiagent_system_evaluator_with_40point_analysis/"
    },
    {
        "id": "dextra_ai_agent_orchestration",
        "type": "html",
        "publisher": "Dextra Labs",
        "url": "https://dextralabs.com/blog/what-is-ai-agent-orchestration/"
    },
    {
        "id": "kubiya_agent_orchestration_frameworks",
        "type": "html",
        "publisher": "Kubiya",
        "url": "https://www.kubiya.ai/blog/ai-agent-orchestration-frameworks"
    },
    {
        "id": "projectpro_ai_agent_evaluation",
        "type": "html",
        "publisher": "ProjectPro",
        "url": "https://www.projectpro.io/article/ai-agent-evaluation/1178"
    },
    {
        "id": "zyrix_multi_agent_testing_guide_2025",
        "type": "html",
        "publisher": "Zyrix AI",
        "url": "https://zyrix.ai/blogs/multi-agent-ai-testing-guide-2025/"
    }
]
# -----------------------------
# Utilities
# -----------------------------
def now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string with a 'Z' suffix."""
    stamp = datetime.now(timezone.utc).isoformat()
    return stamp.replace("+00:00", "Z")
def clean_ws(s: str) -> str:
    """Normalize whitespace: unify line endings to \\n, collapse 3+ blank
    lines to one blank line, squeeze runs of spaces/tabs, and strip ends."""
    unified = re.sub(r"\r\n?", "\n", s)
    unified = re.sub(r"\n{3,}", "\n\n", unified)
    unified = re.sub(r"[ \t]{2,}", " ", unified)
    return unified.strip()
# Common English function words excluded from keyword tagging in
# extract_tags; tokens are lowercased before the membership test.
STOPWORDS = {
    "a","an","and","are","as","at","be","but","by","can","do","does","for","from","how","i","if","in","is","it","of","on","or",
    "that","the","their","then","there","these","this","to","was","were","what","when","where","which","who","why","with","you","your"
}
def chunk_text(text: str, size: int = 1200, overlap: int = 150) -> List[str]:
    """Split text into fixed-size character chunks with overlap.

    Args:
        text: Input text; stripped of surrounding whitespace first.
        size: Maximum chunk length in characters (must be positive).
        overlap: Characters shared between consecutive chunks; clamped
            to [0, size - 1] so the window always advances.

    Returns:
        List of chunks covering the whole text; [] for empty/blank input.

    Raises:
        ValueError: if size is not positive.
    """
    text = text.strip()
    if not text:
        return []
    if size <= 0:
        raise ValueError("size must be positive")
    # Clamp overlap: the original code looped forever when overlap >= size
    # (the start index never advanced). With overlap < size, progress of at
    # least one character per iteration is guaranteed.
    overlap = min(max(overlap, 0), size - 1)
    chunks = []
    start = 0
    n = len(text)
    while start < n:
        end = min(start + size, n)
        chunks.append(text[start:end])
        if end == n:
            break
        start = end - overlap
    return chunks
def extract_tags(text: str, title: Optional[str], max_tags: int = 8) -> List[str]:
    """Derive up to max_tags tags: frequent keywords first, then capitalized entities.

    Keywords are lowercased word tokens (stopwords removed) ranked by
    frequency, ties broken alphabetically; entities are runs of 1-3
    capitalized words kept in order of first appearance.
    """
    content = " ".join(part for part in (title, text) if part)
    # Count keyword candidates: alphanumeric tokens of length >= 3.
    freq: Dict[str, int] = {}
    for token in re.findall(r"[A-Za-z][A-Za-z0-9_]{2,}", content):
        word = token.lower()
        if word not in STOPWORDS:
            freq[word] = freq.get(word, 0) + 1
    keywords = sorted(freq, key=lambda w: (-freq[w], w))[:max_tags]
    # Entity candidates: capitalized word groups, deduplicated in order.
    entities: List[str] = []
    for match in re.findall(r"\b[A-Z][a-zA-Z]+\b(?:\s+[A-Z][a-zA-Z]+\b){0,2}", content):
        candidate = match.strip()
        if candidate.lower() in STOPWORDS or candidate in entities:
            continue
        entities.append(candidate)
        if len(entities) >= max_tags:
            break
    # Merge both pools, preserving order and dropping duplicates.
    merged: List[str] = []
    for tag in keywords + entities:
        if tag and tag not in merged:
            merged.append(tag)
    return merged[:max_tags]
def normalize_url(url: str) -> str:
    """Drop a trailing empty 'title=' query parameter (seen on BytePlus links)."""
    if not url.endswith("title="):
        return url
    return url[: -len("title=")].rstrip("?&")
def extract_visible_text(html: str) -> str:
    """Return the page's visible text with boilerplate tags removed and whitespace cleaned."""
    soup = BeautifulSoup(html, "html.parser")
    # Drop non-content containers before extracting text.
    for element in soup(["script", "style", "noscript", "svg", "header", "footer", "nav", "aside"]):
        element.decompose()
    return clean_ws(soup.get_text("\n"))
def safe_get(session: requests.Session, url: str) -> requests.Response:
    """GET url with up to 3 attempts and linear backoff between retries.

    Retries only on network-level errors (DNS failure, timeout, dropped
    connection); HTTP error statuses are returned for the caller to handle.

    Raises:
        requests.RequestException: the last network error, if all attempts fail.
    """
    last_exc: Optional[Exception] = None
    for attempt in range(3):
        try:
            return session.get(url, timeout=TIMEOUT_S, allow_redirects=True)
        # Narrowed from a blanket `except Exception`: only requests'
        # transport errors are retryable; programming errors should surface.
        except requests.RequestException as exc:
            last_exc = exc
            # No point sleeping after the final failed attempt.
            if attempt < 2:
                time.sleep(1.25 * (attempt + 1))
    raise last_exc
# -----------------------------
# Metadata extraction (best effort)
# -----------------------------
def extract_meta_from_html(html: str, url: str) -> Tuple[str, Optional[str], Optional[str]]:
    """
    Returns: (title, author, publication_date_iso)
    Best-effort using meta tags commonly found in blogs/news sites.
    """
    soup = BeautifulSoup(html, "html.parser")

    def read_meta(key: str) -> Optional[str]:
        # Sites expose the same keys via either name= or property= attributes.
        for attr in ("name", "property"):
            tag = soup.find("meta", attrs={attr: key})
            if tag and tag.get("content"):
                return tag["content"].strip()
        return None

    title = ""
    if soup.title and soup.title.get_text(strip=True):
        title = soup.title.get_text(strip=True)
    # Social-card titles are usually cleaner than <title>; prefer them.
    social_title = read_meta("og:title") or read_meta("twitter:title")
    if social_title:
        title = social_title
    author = read_meta("author") or read_meta("article:author") or read_meta("og:article:author")
    raw_date = read_meta("article:published_time") or read_meta("og:article:published_time") or read_meta("pubdate") or read_meta("date")
    pub_iso = None
    if raw_date:
        if re.match(r"^\d{4}-\d{2}-\d{2}", raw_date):
            # Already ISO-like; keep verbatim.
            pub_iso = raw_date
        else:
            # Parse loose formats like "Jan 10, 2025"; keep raw value on failure.
            try:
                from dateutil import parser as dtparser  # python-dateutil in requirements
                pub_iso = dtparser.parse(raw_date).astimezone(timezone.utc).isoformat().replace("+00:00", "Z")
            except Exception:
                pub_iso = raw_date  # best-effort fallback
    return title.strip(), (author.strip() if author else None), (pub_iso.strip() if pub_iso else None)
# -----------------------------
# HTML extraction
# -----------------------------
def extract_main_text_readability(html: str) -> Tuple[str, str]:
    """Extract (title, main text) via readability, keeping headings, paragraphs and list items."""
    doc = Document(html)
    title = doc.short_title() or ""
    body = BeautifulSoup(doc.summary(html_partial=True), "html.parser")
    lines = []
    for element in body.find_all(["h1", "h2", "h3", "p", "li"]):
        segment = element.get_text(" ", strip=True)
        if segment:
            lines.append(segment)
    return title.strip(), clean_ws("\n".join(lines))
def fetch_html_article(session: requests.Session, url: str) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
    """Fetch an HTML article and return (title, author, pub_date, text).

    On failure returns (None, None, None, error_message): the fourth slot
    carries either the extracted text or the reason for skipping.
    """
    url = normalize_url(url)
    response = safe_get(session, url)
    if response.status_code == 403:
        return None, None, None, f"403 Forbidden (site blocked requests): {url}"
    if response.status_code >= 400:
        return None, None, None, f"HTTP {response.status_code}: {url}"
    html = response.text
    meta_title, author, pub_date = extract_meta_from_html(html, url)
    readability_title, text = extract_main_text_readability(html)
    # Readability's title is usually cleanest; fall back to meta, then URL.
    final_title = readability_title or meta_title or url
    # Fallback 1: all <p> text, when readability came back too thin.
    if not text or len(text) < 500:
        soup = BeautifulSoup(html, "html.parser")
        paragraphs = clean_ws("\n".join(p.get_text(" ", strip=True) for p in soup.find_all("p")))
        if len(paragraphs) > len(text):
            text = paragraphs
    # Fallback 2: any visible text on the page.
    if not text or len(text) < 300:
        visible = extract_visible_text(html)
        if len(visible) > len(text or ""):
            text = visible
    if not text or len(text) < 200:
        return None, None, None, f"Could not extract sufficient text from: {url}"
    return final_title, author, pub_date, text
# -----------------------------
# PDF extraction (arXiv etc.)
# -----------------------------
def fetch_pdf_text(session: requests.Session, url: str) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
    """Download a PDF and return (title, author, pub_date, text).

    On failure returns (None, None, None, error_message). Title/author/date
    are best-effort placeholders; enrich later (e.g. via the arXiv API).
    """
    r = safe_get(session, url)
    if r.status_code >= 400:
        return None, None, None, f"HTTP {r.status_code}: {url}"
    # pdfminer wants a file path, so spill the download to a unique temp
    # file. NamedTemporaryFile avoids the name collisions the previous
    # millisecond-clock filename could produce, and no longer needs to
    # create an OUT_DIR/_tmp directory.
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
            tmp.write(r.content)
            tmp_path = tmp.name
        text = pdfminer_extract_text(tmp_path) or ""
    finally:
        # Always clean up the temp file, even if extraction raised.
        if tmp_path:
            try:
                os.remove(tmp_path)
            except OSError:
                pass
    text = clean_ws(text)
    if not text or len(text) < 800:
        return None, None, None, f"PDF text extraction too small for: {url}"
    # Placeholders: PDF metadata is unreliable; callers treat these as optional.
    title = "arXiv paper"
    author = None
    pub_date = None
    return title, author, pub_date, text
# -----------------------------
# Main ingestion
# -----------------------------
def main():
    """Ingest all configured sources: fetch, extract, chunk, and write outputs.

    Produces three files under OUT_DIR:
      - OUT_JSONL: one JSON record per chunk
      - OUT_MANIFEST: per-document summary
      - articles_ingest_report.json: run stats plus skipped sources
    """
    os.makedirs(OUT_DIR, exist_ok=True)
    session = requests.Session()
    session.headers.update(HEADERS)  # browser-like headers reduce 403s
    written = 0          # total chunk records emitted
    skipped = []         # sources we could not ingest, with reasons
    manifest_docs = []   # one summary entry per successfully ingested source
    with open(OUT_JSONL, "w", encoding="utf-8") as out:
        for src in SOURCES:
            doc_id = f"article::{src['id']}"
            url = src["url"]
            publisher = src.get("publisher")
            # Dispatch on source type; each fetcher returns
            # (title, author, pub_date, text) or (None, None, None, error).
            if src["type"] == "html":
                title, author, pub_date, text_or_err = fetch_html_article(session, url)
            elif src["type"] == "pdf":
                title, author, pub_date, text_or_err = fetch_pdf_text(session, url)
            else:
                skipped.append({"id": src["id"], "url": url, "reason": f"Unknown type: {src['type']}"})
                continue
            if title is None:
                # Fetch/extract failed; the fourth slot holds the error message.
                skipped.append({"id": src["id"], "url": url, "reason": text_or_err})
                continue
            text = text_or_err
            chunks = chunk_text(text, size=1200, overlap=150)
            if not chunks:
                skipped.append({"id": src["id"], "url": url, "reason": "No chunks produced"})
                continue
            for i, chunk in enumerate(chunks, 1):
                breadcrumbs = f"Article: {title}"
                tags = extract_tags(chunk, title)
                # NOTE: several fields are intentionally duplicated
                # (title/doc_title, url/source_url, date/publication_date)
                # for downstream consumers with differing schemas.
                rec = {
                    "chunk_id": f"{doc_id}::{i:06d}",
                    "doc_id": doc_id,
                    "doc_title": title,
                    "title": title,
                    "doc_type": "article",
                    "publisher": publisher,
                    "author": author,
                    "publication_date": pub_date,
                    "source_url": url,
                    "section_title": None,
                    "page_start": None,
                    "page_end": None,
                    "source_type": "article",
                    "date": pub_date,
                    "url": url,
                    "priority": 1,
                    "tags": tags,
                    "breadcrumbs": breadcrumbs,
                    "chunk_type": "section",
                    # Breadcrumbs are prefixed into the text so embedding models
                    # see the article context along with the chunk body.
                    "text": f"Breadcrumbs: {breadcrumbs}\n{chunk}",
                }
                out.write(json.dumps(rec, ensure_ascii=False) + "\n")
                written += 1
            manifest_docs.append(
                {
                    "id": doc_id,
                    "title": title,
                    "format": "pdf" if src["type"] == "pdf" else "html",
                    "filename": url,
                    "blocks": len(chunks),
                    "source_type": "article",
                    "url": url,
                    "publisher": publisher,
                    "author": author,
                    "publication_date": pub_date,
                    "date": pub_date,
                }
            )
            print(f"[OK] {src['id']}: {len(chunks)} chunks")
    manifest = {
        "generated_at": now_iso(),
        "documents": manifest_docs,
    }
    with open(OUT_MANIFEST, "w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2, ensure_ascii=False)
    # Write a small ingestion report
    report_path = os.path.join(OUT_DIR, "articles_ingest_report.json")
    report = {
        "generated_at": now_iso(),
        "out_jsonl": OUT_JSONL,
        "out_manifest": OUT_MANIFEST,
        "total_chunks_written": written,
        "sources_total": len(SOURCES),
        "sources_skipped": skipped,
        "notes": [
            "Towards Data Science links may return 403 and are skipped to keep the pipeline reproducible.",
            "arXiv PDFs are ingested via pdfminer; title/author/date may be enriched later.",
        ],
    }
    with open(report_path, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2, ensure_ascii=False)
    print(f"[DONE] Wrote {written} chunks to {OUT_JSONL}")
    if skipped:
        print(f"[WARN] Skipped {len(skipped)} sources. See {report_path}.")
if __name__ == "__main__":
    main()