"""
KIA Main Pipeline
====================
Orchestrates the full data collection, processing, and dataset creation
pipeline.

Usage:
    python main.py scrape    # Run all scrapers
    python main.py process   # Process raw data into dataset
    python main.py upload    # Upload dataset to HuggingFace
    python main.py all       # Run everything
    python main.py stats     # Show statistics
    python main.py serve     # Launch the web UI and the API server
"""

import os
import sys
import json
import glob
import time
import hashlib
import logging
from pathlib import Path

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("shp_ai.log", encoding="utf-8"),
    ],
)
logger = logging.getLogger("KIA")

# Errors that reading/parsing one JSON document can raise: I/O failures,
# malformed JSON or bad encoding (both ValueError subclasses), and documents
# whose top-level value or "text" field has an unexpected type.
_JSON_READ_ERRORS = (OSError, ValueError, AttributeError, TypeError)


def run_scrapers():
    """Run all scrapers to collect raw data.

    Each source is attempted independently: a failure is logged and the
    remaining sources still run.

    Returns:
        A list with the stats dict of every source that completed.
    """
    from scraper.config import MOD_CONFIG, AAF_CONFIG
    from scraper.base_crawler import BaseCrawler
    from scraper.wiki_scraper import WikipediaScraper
    from scraper.pdf_extractor import PDFExtractor

    def crawl_site(config):
        # Crawl one website described by a config dict and return its stats.
        crawler = BaseCrawler(
            name=config["name"],
            base_url=config["base_url"],
            start_urls=config["start_urls"],
            allowed_domains=config["allowed_domains"],
            output_dir=config["output_dir"],
        )
        crawler.crawl()
        return crawler.get_stats()

    def scrape_wikipedia():
        scraper = WikipediaScraper()
        scraper.crawl()
        return scraper.get_stats()

    def extract_pdfs():
        extractor = PDFExtractor()
        extractor.extract_all()
        return extractor.get_stats()

    # (progress banner, label used in the failure log, callable returning stats)
    stages = [
        (
            " [1/4] Crawling: Ministria e Mbrojtjes (mod.gov.al)...",
            "MOD crawler",
            lambda: crawl_site(MOD_CONFIG),
        ),
        (
            "\n [2/4] Crawling: Forcat e Armatosura (aaf.mil.al)...",
            "AAF crawler",
            lambda: crawl_site(AAF_CONFIG),
        ),
        (
            "\n [3/4] Scraping: Wikipedia (SQ + EN)...",
            "Wikipedia scraper",
            scrape_wikipedia,
        ),
        # PDFs (if any were downloaded)
        (
            "\n [4/4] Extracting: PDF Documents...",
            "PDF extractor",
            extract_pdfs,
        ),
    ]

    all_stats = []
    start_time = time.time()

    print("\n" + "=" * 60)
    print(" KIA - Data Collection Pipeline")
    print("=" * 60 + "\n")

    for banner, label, collect in stages:
        print(banner)
        try:
            all_stats.append(collect())
        except Exception as e:
            # Best effort by design: one broken source must not stop the rest.
            logger.error("%s failed: %s", label, e)

    elapsed = time.time() - start_time

    # Print summary
    print("\n" + "=" * 60)
    print(" SCRAPING COMPLETE")
    print("=" * 60)
    print(f"\nTotal time: {elapsed:.1f}s ({elapsed/60:.1f} min)")

    for stat in all_stats:
        print(f"\nStats for {stat.get('source', 'Unknown')}:")
        for key, value in stat.items():
            if key != 'source':
                print(f" {key}: {value}")

    return all_stats


def _load_raw_documents(raw_dir):
    """Load every JSON document under raw_dir whose "text" exceeds 100 chars."""
    documents = []
    pattern = os.path.join(raw_dir, "**", "*.json")
    for filepath in glob.glob(pattern, recursive=True):
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                data = json.load(f)
            text = data.get("text")
            if text and len(text) > 100:
                documents.append(data)
        except _JSON_READ_ERRORS as e:
            logger.warning("Failed to load %s: %s", filepath, e)
    return documents


def _cleaned_filename(doc):
    """Build a file name for a cleaned document that is stable across runs.

    The built-in hash() is salted per process for strings, so it yields a
    different name on every run, and reducing it modulo 10000 makes distinct
    documents overwrite each other. A SHA-256 prefix of the URL (or of the
    text when there is no URL) avoids both problems.
    """
    source = str(doc.get("source") or "unknown").lower().replace(" ", "_")[:20]
    identity = str(doc.get("url") or doc.get("text") or "")
    digest = hashlib.sha256(identity.encode("utf-8", "replace")).hexdigest()[:12]
    return f"cleaned_{source}_{digest}.json"


def _save_cleaned_documents(docs, cleaned_dir):
    """Write each cleaned document as pretty-printed JSON into cleaned_dir."""
    os.makedirs(cleaned_dir, exist_ok=True)
    for doc in docs:
        filepath = os.path.join(cleaned_dir, _cleaned_filename(doc))
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(doc, f, ensure_ascii=False, indent=2)


def run_processing():
    """Process raw data into instruction dataset.

    Steps: load raw JSON, clean and quality-filter the text, save the cleaned
    documents, chunk them, then build and save the dataset.

    Returns:
        The stats dict reported by the dataset builder, or None when there
        are no raw documents to process.
    """
    from scraper.config import RAW_DIR, CLEANED_DIR
    from processing.cleaner import TextCleaner
    from processing.chunker import TextChunker
    from processing.dataset_builder import DatasetBuilder

    print("\n" + "=" * 60)
    print(" KIA - Data Processing Pipeline")
    print("=" * 60 + "\n")

    # 1. Load all raw JSON files
    print(" Loading raw data...")
    documents = _load_raw_documents(RAW_DIR)
    print(f" Loaded {len(documents)} documents")

    if not documents:
        print(" No documents found! Run 'python main.py scrape' first.")
        return None

    # 2. Clean text
    print("\n Cleaning text...")
    cleaner = TextCleaner()
    cleaned_docs = []
    for doc in documents:
        cleaned_text = cleaner.clean_text(doc.get("text", ""))
        if cleaner.is_quality_content(cleaned_text):
            doc["text"] = cleaned_text
            doc["text_length"] = len(cleaned_text)
            cleaned_docs.append(doc)

    print(f" Quality documents: {len(cleaned_docs)} / {len(documents)}")

    # Save cleaned documents
    _save_cleaned_documents(cleaned_docs, CLEANED_DIR)

    # 3. Chunk text
    print("\n Chunking text...")
    chunker = TextChunker(chunk_size=1200, overlap=100, min_chunk_size=200)
    chunks = chunker.chunk_all_documents(cleaned_docs)
    print(f" Total chunks: {len(chunks)}")

    # 4. Build dataset
    print("\n Building instruction dataset...")
    builder = DatasetBuilder()
    builder.build_dataset(chunks)

    # 5. Save dataset
    print("\n Saving dataset...")
    builder.save_dataset(train_ratio=0.9)

    stats = builder.get_stats()
    print(f"\n Dataset built: {stats['total_examples']} examples")

    return stats


def run_upload():
    """Upload dataset to HuggingFace.

    Prints an error and returns without uploading when the uploader has no
    username set.
    """
    from upload.hf_upload import HFUploader

    print("\n" + "=" * 60)
    print(" KIA - HuggingFace Upload")
    print("=" * 60 + "\n")

    uploader = HFUploader()
    if not uploader.username:
        print(" Failed to authenticate with HuggingFace!")
        return

    repo = uploader.upload_dataset()
    print(f"\n Dataset live at: https://huggingface.co/datasets/{repo}")


def _count_lines(path):
    """Return the number of lines in path, or 0 when the file does not exist."""
    if not os.path.exists(path):
        return 0
    with open(path, "r", encoding="utf-8") as f:
        return sum(1 for _ in f)


def show_stats():
    """Show current data statistics.

    Reports raw file and character counts, train/validation example counts,
    and the number of JSON files in each sub-directory of the raw data dir.
    """
    from scraper.config import RAW_DIR, DATASET_DIR

    print("\n" + "=" * 60)
    print(" KIA - Statistics")
    print("=" * 60)

    # Raw data
    raw_files = glob.glob(os.path.join(RAW_DIR, "**", "*.json"), recursive=True)
    raw_chars = 0
    for raw_file in raw_files:
        try:
            with open(raw_file, "r", encoding="utf-8") as fh:
                data = json.load(fh)
            raw_chars += len(data.get("text", ""))
        except _JSON_READ_ERRORS as e:
            # Unreadable files still count as files but add no characters.
            logger.debug("Skipping %s in stats: %s", raw_file, e)

    print("\n Raw Data:")
    print(f" Files: {len(raw_files)}")
    print(f" Total chars: {raw_chars:,}")
    # Rough estimate assuming about five characters per word.
    print(f" Estimated words: {raw_chars // 5:,}")

    # Dataset
    train_count = _count_lines(os.path.join(DATASET_DIR, "train.jsonl"))
    val_count = _count_lines(os.path.join(DATASET_DIR, "validation.jsonl"))

    print("\n Dataset:")
    print(f" Train examples: {train_count}")
    print(f" Validation examples: {val_count}")
    print(f" Total: {train_count + val_count}")

    # By source
    print("\n By Source:")
    # The raw directory may not exist yet if nothing has been scraped.
    subdirs = sorted(os.listdir(RAW_DIR)) if os.path.isdir(RAW_DIR) else []
    for subdir in subdirs:
        subdir_path = os.path.join(RAW_DIR, subdir)
        if os.path.isdir(subdir_path):
            count = sum(
                1 for name in os.listdir(subdir_path) if name.endswith('.json')
            )
            print(f" {subdir}: {count} files")


def run_serve():
    """Launch the modern Vite + FastAPI UI.

    Starts the uvicorn backend and the Vite dev server as child processes and
    blocks until both exit. Ctrl+C, or a failure to start either one, stops
    whatever is still running.
    """
    import subprocess

    project_root = os.path.dirname(os.path.abspath(__file__))

    # Needs to be able to find app.api
    if project_root not in sys.path:
        sys.path.append(project_root)

    print("\n" + "=" * 60)
    print(" KIA - Booting Digital Command Center")
    print("=" * 60 + "\n")

    frontend_dir = os.path.join(project_root, "frontend")

    # On POSIX, shell=True with an argument list runs only the first item as
    # the command and hands the rest to the shell itself, so the programs
    # would start without their arguments. The shell is kept on Windows only,
    # where it is what resolves launchers such as npx.cmd.
    use_shell = os.name == "nt"

    launches = [
        (
            ["uvicorn", "app.api:app", "--host", "0.0.0.0", "--port", "8001"],
            project_root,
        ),
        (["npx", "vite", "--host"], frontend_dir),
    ]

    processes = []
    try:
        for argv, workdir in launches:
            processes.append(
                subprocess.Popen(argv, cwd=workdir, shell=use_shell)
            )
        for proc in processes:
            proc.wait()
    except KeyboardInterrupt:
        print("\nArresting UI processes...")
    except OSError as e:
        # Typically an executable (uvicorn / npx) missing from PATH.
        logger.error("Failed to start UI processes: %s", e)
    finally:
        # Never leave a child running once this function returns.
        for proc in processes:
            if proc.poll() is None:
                proc.terminate()


def main():
    """Dispatch the command given as the first CLI argument.

    Prints usage help when no command is given and an error message when the
    command is not recognised.
    """
    if len(sys.argv) < 2:
        print("Usage: python main.py [scrape|process|upload|all|stats|serve]")
        print()
        print("Commands:")
        print(" scrape - Crawl/scrape all data sources")
        print(" process - Process raw data into dataset")
        print(" upload - Upload dataset to HuggingFace")
        print(" all - Run scrape -> process -> upload")
        print(" stats - Show current data statistics")
        print(" serve - Launch modern 3D UI and FastAPI server")
        return

    # Each command maps to the steps it runs, in order.
    pipelines = {
        "scrape": (run_scrapers,),
        "process": (run_processing,),
        "upload": (run_upload,),
        "all": (run_scrapers, run_processing, run_upload),
        "stats": (show_stats,),
        "serve": (run_serve,),
    }

    command = sys.argv[1].lower()
    steps = pipelines.get(command)
    if steps is None:
        print(f"Unknown command: {command}")
        print("Use: scrape, process, upload, all, stats, serve")
        return

    for step in steps:
        step()


if __name__ == "__main__":
    main()