Spaces:
Sleeping
Sleeping
"""
KIA Main Pipeline
====================
Orchestrates the full data collection, processing, and dataset creation pipeline.
Usage:
    python main.py scrape   # Run all scrapers
    python main.py process  # Process raw data into dataset
    python main.py upload   # Upload dataset to HuggingFace
    python main.py all      # Run scrape -> process -> upload
    python main.py stats    # Show statistics
    python main.py serve    # Launch the web UI and FastAPI server
"""
| import os | |
| import sys | |
| import json | |
| import glob | |
| import time | |
| import logging | |
| from pathlib import Path | |
# Logging: every record goes both to the console and to a UTF-8 log file
# created in the current working directory.
_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
_log_handlers = [
    logging.StreamHandler(),
    logging.FileHandler("shp_ai.log", encoding="utf-8"),
]
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT, handlers=_log_handlers)

# Shared logger used by the pipeline functions in this file.
logger = logging.getLogger("KIA")
def run_scrapers():
    """Collect raw data from every configured source.

    Four stages run in a fixed order: the mod.gov.al crawler, the aaf.mil.al
    crawler, the Wikipedia scraper and the PDF extractor. A stage that raises
    is logged through ``logger.error`` and skipped, so the remaining stages
    still run.

    Returns:
        list: the ``get_stats()`` result of each stage that completed, in
        execution order.
    """
    from scraper.config import MOD_CONFIG, AAF_CONFIG
    from scraper.base_crawler import BaseCrawler
    from scraper.wiki_scraper import WikipediaScraper
    from scraper.pdf_extractor import PDFExtractor

    def site_crawler(config):
        # Both website crawlers are built from the same five config keys.
        return BaseCrawler(
            name=config["name"],
            base_url=config["base_url"],
            start_urls=config["start_urls"],
            allowed_domains=config["allowed_domains"],
            output_dir=config["output_dir"],
        )

    # (banner, label used in the failure log, factory, method that does the work)
    stages = (
        (" [1/4] Crawling: Ministria e Mbrojtjes (mod.gov.al)...",
         "MOD crawler", lambda: site_crawler(MOD_CONFIG), "crawl"),
        ("\n [2/4] Crawling: Forcat e Armatosura (aaf.mil.al)...",
         "AAF crawler", lambda: site_crawler(AAF_CONFIG), "crawl"),
        ("\n [3/4] Scraping: Wikipedia (SQ + EN)...",
         "Wikipedia scraper", WikipediaScraper, "crawl"),
        ("\n [4/4] Extracting: PDF Documents...",
         "PDF extractor", PDFExtractor, "extract_all"),
    )

    rule = "=" * 60
    collected = []
    started = time.time()

    print("\n" + rule)
    print(" KIA - Data Collection Pipeline")
    print(rule + "\n")

    for banner, label, build, action in stages:
        print(banner)
        try:
            # Construction stays inside the try: a bad config must not stop
            # the other stages.
            worker = build()
            getattr(worker, action)()
            collected.append(worker.get_stats())
        except Exception as exc:
            logger.error(f"{label} failed: {exc}")

    elapsed = time.time() - started

    # Summary
    print("\n" + rule)
    print(" SCRAPING COMPLETE")
    print(rule)
    print(f"\nTotal time: {elapsed:.1f}s ({elapsed/60:.1f} min)")
    for stage_stats in collected:
        print(f"\nStats for {stage_stats.get('source', 'Unknown')}:")
        for key, value in stage_stats.items():
            if key == 'source':
                continue
            print(f" {key}: {value}")
    return collected
def run_processing():
    """Turn the scraped raw JSON documents into an instruction dataset.

    Steps: load every ``*.json`` under ``RAW_DIR`` (recursively), clean the
    text and keep only documents the cleaner accepts, write those to
    ``CLEANED_DIR``, chunk them, build the dataset and save it with
    ``train_ratio=0.9``.

    Returns:
        The value returned by ``DatasetBuilder.get_stats()``, or ``None`` when
        no usable raw document was found.
    """
    from scraper.config import RAW_DIR, CLEANED_DIR
    from processing.cleaner import TextCleaner
    from processing.chunker import TextChunker
    from processing.dataset_builder import DatasetBuilder

    print("\n" + "=" * 60)
    print(" KIA - Data Processing Pipeline")
    print("=" * 60 + "\n")

    # 1. Load all raw JSON files
    print(" Loading raw data...")
    documents = _load_raw_documents(RAW_DIR)
    print(f" Loaded {len(documents)} documents")
    if not documents:
        print(" No documents found! Run 'python main.py scrape' first.")
        return None

    # 2. Clean text
    print("\n Cleaning text...")
    cleaner = TextCleaner()
    cleaned_docs = []
    for doc in documents:
        cleaned_text = cleaner.clean_text(doc.get("text", ""))
        if cleaner.is_quality_content(cleaned_text):
            doc["text"] = cleaned_text
            doc["text_length"] = len(cleaned_text)
            cleaned_docs.append(doc)
    print(f" Quality documents: {len(cleaned_docs)} / {len(documents)}")

    # Save cleaned documents. The directory is created here so a fresh
    # checkout does not fail on the first write.
    os.makedirs(CLEANED_DIR, exist_ok=True)
    for doc in cleaned_docs:
        filepath = os.path.join(CLEANED_DIR, _cleaned_filename(doc))
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(doc, f, ensure_ascii=False, indent=2)

    # 3. Chunk text
    print("\n Chunking text...")
    chunker = TextChunker(chunk_size=1200, overlap=100, min_chunk_size=200)
    chunks = chunker.chunk_all_documents(cleaned_docs)
    print(f" Total chunks: {len(chunks)}")

    # 4. Build dataset
    print("\n Building instruction dataset...")
    builder = DatasetBuilder()
    builder.build_dataset(chunks)

    # 5. Save dataset
    print("\n Saving dataset...")
    builder.save_dataset(train_ratio=0.9)
    stats = builder.get_stats()
    print(f"\n Dataset built: {stats['total_examples']} examples")
    return stats


def _load_raw_documents(raw_dir):
    """Return the JSON objects under *raw_dir* whose "text" is longer than 100 chars."""
    documents = []
    pattern = os.path.join(raw_dir, "**", "*.json")
    # Sorted so the document order does not depend on filesystem order.
    for filepath in sorted(glob.glob(pattern, recursive=True)):
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both invalid JSON and undecodable bytes.
            logger.warning(f"Failed to load {filepath}: {e}")
            continue
        if not isinstance(data, dict):
            logger.warning(f"Failed to load {filepath}: top-level JSON value is not an object")
            continue
        text = data.get("text")
        if isinstance(text, str) and len(text) > 100:
            documents.append(data)
    return documents


def _cleaned_filename(doc):
    """Build a stable, filesystem-safe file name for one cleaned document."""
    import hashlib

    source = str(doc.get("source") or "unknown").lower()
    # Spaces as before, plus path separators so the name cannot escape the
    # cleaned directory.
    for unsafe in (" ", "/", "\\"):
        source = source.replace(unsafe, "_")
    source = source[:20]

    # Built-in hash() of a str is salted per process, so it produced a
    # different name on every run, and "% 10000" left only 10,000 slots, so
    # documents could silently overwrite each other. A content digest is
    # stable across runs; documents without a URL are keyed by their text
    # instead of all sharing one name.
    identity = doc.get("url") or doc.get("text", "")
    digest = hashlib.sha256(str(identity).encode("utf-8")).hexdigest()[:12]
    return f"cleaned_{source}_{digest}.json"
def run_upload():
    """Publish the built dataset to the HuggingFace Hub.

    Creates an ``HFUploader``; when it has no ``username`` an authentication
    failure message is printed and nothing is uploaded. Otherwise the dataset
    is uploaded and its URL is printed.
    """
    from upload.hf_upload import HFUploader

    rule = "=" * 60
    print("\n" + rule)
    print(" KIA - HuggingFace Upload")
    print(rule + "\n")

    uploader = HFUploader()
    if uploader.username:
        repo = uploader.upload_dataset()
        print(f"\n Dataset live at: https://huggingface.co/datasets/{repo}")
    else:
        print(" Failed to authenticate with HuggingFace!")
def show_stats():
    """Print counts for the raw files, the built dataset and each raw source.

    Reads every ``*.json`` under ``RAW_DIR`` (recursively) to total the length
    of their "text" fields, counts the lines of ``train.jsonl`` and
    ``validation.jsonl`` in ``DATASET_DIR``, and lists how many ``.json``
    files sit directly inside each sub-directory of ``RAW_DIR``. Missing
    files or directories are reported as zero rather than raising.
    """
    from scraper.config import RAW_DIR, DATASET_DIR

    def count_lines(path):
        # A dataset split that has not been built yet counts as empty.
        if not os.path.exists(path):
            return 0
        with open(path, "r", encoding="utf-8") as f:
            return sum(1 for _ in f)

    print("\n" + "=" * 60)
    print(" KIA - Statistics")
    print("=" * 60)

    # Raw data
    raw_files = glob.glob(os.path.join(RAW_DIR, "**", "*.json"), recursive=True)
    raw_chars = 0
    for path in raw_files:
        try:
            with open(path, "r", encoding="utf-8") as fh:
                data = json.load(fh)
        except (OSError, ValueError) as e:
            # Previously a bare "except: pass", which hid every problem and
            # also swallowed KeyboardInterrupt. Still best-effort, but the
            # skipped file is now reported.
            logger.warning(f"Skipping unreadable raw file {path}: {e}")
            continue
        text = data.get("text", "") if isinstance(data, dict) else ""
        if isinstance(text, str):
            raw_chars += len(text)
    print("\n Raw Data:")
    print(f" Files: {len(raw_files)}")
    print(f" Total chars: {raw_chars:,}")
    print(f" Estimated words: {raw_chars // 5:,}")

    # Dataset
    train_count = count_lines(os.path.join(DATASET_DIR, "train.jsonl"))
    val_count = count_lines(os.path.join(DATASET_DIR, "validation.jsonl"))
    print("\n Dataset:")
    print(f" Train examples: {train_count}")
    print(f" Validation examples: {val_count}")
    print(f" Total: {train_count + val_count}")

    # By source
    print("\n By Source:")
    # os.listdir raises when RAW_DIR does not exist yet (nothing scraped).
    if os.path.isdir(RAW_DIR):
        for subdir in sorted(os.listdir(RAW_DIR)):
            subdir_path = os.path.join(RAW_DIR, subdir)
            if os.path.isdir(subdir_path):
                count = sum(1 for name in os.listdir(subdir_path) if name.endswith('.json'))
                print(f" {subdir}: {count} files")
def run_serve():
    """Launch the FastAPI backend and the Vite dev server, then wait on them.

    The backend is started as ``uvicorn app.api:app --host 0.0.0.0 --port 8001``
    from this file's directory, and the frontend as ``npx vite --host`` from
    its ``frontend`` sub-directory. The call blocks until both processes exit.
    Ctrl+C prints a message and terminates whichever is still running.
    """
    import subprocess

    project_dir = os.path.dirname(os.path.abspath(__file__))
    # Kept from the original ("needs to be able to find app.api").
    # NOTE(review): this only affects the current process; the uvicorn child
    # is started with cwd=project_dir, so this may be unnecessary - confirm.
    sys.path.append(project_dir)

    print("\n" + "=" * 60)
    print(" KIA - Booting Digital Command Center")
    print("=" * 60 + "\n")

    frontend_dir = os.path.join(project_dir, "frontend")

    # With shell=True and a *list*, POSIX hands only the first item to the
    # shell as the command; the remaining items become arguments of the shell
    # itself, so "uvicorn" and "npx" were started without their options.
    # Windows joins the list into one command line and needs the shell to
    # resolve npx.cmd, so the shell is kept there only.
    use_shell = os.name == "nt"

    children = []
    try:
        children.append(subprocess.Popen(
            ["uvicorn", "app.api:app", "--host", "0.0.0.0", "--port", "8001"],
            cwd=project_dir,
            shell=use_shell,
        ))
        children.append(subprocess.Popen(
            ["npx", "vite", "--host"],
            cwd=frontend_dir,
            shell=use_shell,
        ))
        for child in children:
            child.wait()
    except KeyboardInterrupt:
        print("\nArresting UI processes...")
    finally:
        # Also reached when the second Popen fails, so a backend that did
        # start is not left running.
        for child in children:
            if child.poll() is None:
                child.terminate()
def main():
    """Command-line entry point: run the step named by ``sys.argv[1]``.

    Without an argument the usage text is printed. The command is matched
    case-insensitively; an unrecognised command prints a hint instead of
    raising. ``all`` runs scrape, process and upload in that order.
    """
    if len(sys.argv) < 2:
        usage_lines = (
            "Usage: python main.py [scrape|process|upload|all|stats|serve]",
            "",
            "Commands:",
            " scrape - Crawl/scrape all data sources",
            " process - Process raw data into dataset",
            " upload - Upload dataset to HuggingFace",
            " all - Run scrape -> process -> upload",
            " stats - Show current data statistics",
            " serve - Launch modern 3D UI and FastAPI server",
        )
        for line in usage_lines:
            print(line)
        return

    command = sys.argv[1].lower()

    # Each command maps to the steps it runs, in order.
    pipelines = {
        "scrape": (run_scrapers,),
        "process": (run_processing,),
        "upload": (run_upload,),
        "all": (run_scrapers, run_processing, run_upload),
        "stats": (show_stats,),
        "serve": (run_serve,),
    }

    steps = pipelines.get(command)
    if steps is None:
        print(f"Unknown command: {command}")
        print("Use: scrape, process, upload, all, stats, serve")
        return

    for step in steps:
        step()


if __name__ == "__main__":
    main()