# kiafa's picture
# Premium UI/UX Overhaul & Optimization Update
# b96f3a5 verified
"""
KIA Main Pipeline
====================
Orchestrates the full data collection, processing, and dataset creation pipeline.
Usage:
python main.py scrape # Run all scrapers
python main.py process # Process raw data into dataset
python main.py upload # Upload dataset to HuggingFace
python main.py all # Run everything
python main.py stats # Show statistics
python main.py serve # Launch the Vite frontend and FastAPI server
"""
import os
import sys
import json
import glob
import time
import logging
from pathlib import Path
# Logging setup (runs once at import time): records at INFO level and above
# are sent both to the console and to the UTF-8 file "shp_ai.log", which is
# opened relative to the current working directory.
_console_handler = logging.StreamHandler()
_file_handler = logging.FileHandler("shp_ai.log", encoding="utf-8")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    handlers=[_console_handler, _file_handler],
)

# Named logger used by the pipeline functions in this module.
logger = logging.getLogger("KIA")
def run_scrapers():
    """Collect raw data from every configured source.

    Runs four steps in a fixed order: the mod.gov.al crawler, the aaf.mil.al
    crawler, the Wikipedia scraper and the PDF extractor. Each step is
    isolated: if it raises, the error is logged and the next step still runs.

    Returns:
        list: the ``get_stats()`` result of every step that completed, in
        execution order.
    """
    from scraper.config import MOD_CONFIG, AAF_CONFIG
    from scraper.base_crawler import BaseCrawler
    from scraper.wiki_scraper import WikipediaScraper
    from scraper.pdf_extractor import PDFExtractor

    def crawl_site(site):
        # Build a crawler from one site config mapping, run it, report stats.
        crawler = BaseCrawler(
            name=site["name"],
            base_url=site["base_url"],
            start_urls=site["start_urls"],
            allowed_domains=site["allowed_domains"],
            output_dir=site["output_dir"],
        )
        crawler.crawl()
        return crawler.get_stats()

    def scrape_wikipedia():
        scraper = WikipediaScraper()
        scraper.crawl()
        return scraper.get_stats()

    def extract_pdfs():
        extractor = PDFExtractor()
        extractor.extract_all()
        return extractor.get_stats()

    # (progress line, label used in the failure log, callable returning stats)
    steps = (
        (
            " [1/4] Crawling: Ministria e Mbrojtjes (mod.gov.al)...",
            "MOD crawler",
            lambda: crawl_site(MOD_CONFIG),
        ),
        (
            "\n [2/4] Crawling: Forcat e Armatosura (aaf.mil.al)...",
            "AAF crawler",
            lambda: crawl_site(AAF_CONFIG),
        ),
        (
            "\n [3/4] Scraping: Wikipedia (SQ + EN)...",
            "Wikipedia scraper",
            scrape_wikipedia,
        ),
        (
            "\n [4/4] Extracting: PDF Documents...",
            "PDF extractor",
            extract_pdfs,
        ),
    )

    collected = []
    started = time.time()
    rule = "=" * 60

    print("\n" + rule)
    print(" KIA - Data Collection Pipeline")
    print(rule + "\n")

    for progress_line, label, step in steps:
        print(progress_line)
        try:
            collected.append(step())
        except Exception as e:
            # Best effort: one failing source must not abort the others.
            logger.error(f"{label} failed: {e}")

    elapsed = time.time() - started

    # Summary: total duration, then every stat of each source except its name.
    print("\n" + rule)
    print(" SCRAPING COMPLETE")
    print(rule)
    print(f"\nTotal time: {elapsed:.1f}s ({elapsed/60:.1f} min)")
    for source_stats in collected:
        print(f"\nStats for {source_stats.get('source', 'Unknown')}:")
        for field, amount in source_stats.items():
            if field == 'source':
                continue
            print(f" {field}: {amount}")

    return collected
def run_processing():
    """Process raw scraped data into an instruction dataset.

    Steps, in order:
      1. Load every ``*.json`` file found recursively under ``RAW_DIR`` and
         keep the documents whose ``"text"`` is longer than 100 characters.
         Files that cannot be loaded are logged and skipped.
      2. Clean each text with ``TextCleaner.clean_text`` and keep the
         documents accepted by ``TextCleaner.is_quality_content``.
      3. Write each kept document as JSON into ``CLEANED_DIR``.
      4. Chunk the kept documents with ``TextChunker`` and pass the chunks to
         ``DatasetBuilder``, which builds and saves the dataset.

    Returns:
        The value of ``DatasetBuilder.get_stats()`` (its ``"total_examples"``
        entry is printed), or ``None`` when no raw document qualified.
    """
    from scraper.config import RAW_DIR, CLEANED_DIR
    from processing.cleaner import TextCleaner
    from processing.chunker import TextChunker
    from processing.dataset_builder import DatasetBuilder

    print("\n" + "=" * 60)
    print(" KIA - Data Processing Pipeline")
    print("=" * 60 + "\n")

    # 1. Load all raw JSON files
    print(" Loading raw data...")
    documents = []
    json_files = glob.glob(os.path.join(RAW_DIR, "**", "*.json"), recursive=True)
    for filepath in json_files:
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                data = json.load(f)
            text = data.get("text")
            if text and len(text) > 100:
                documents.append(data)
        except Exception as e:
            # Best effort: an unreadable or malformed file (including JSON
            # that is not an object) is reported and skipped.
            logger.warning("Failed to load %s: %s", filepath, e)
    print(f" Loaded {len(documents)} documents")

    if not documents:
        print(" No documents found! Run 'python main.py scrape' first.")
        return

    # 2. Clean text
    print("\n Cleaning text...")
    cleaner = TextCleaner()
    cleaned_docs = []
    for doc in documents:
        cleaned_text = cleaner.clean_text(doc.get("text", ""))
        if cleaner.is_quality_content(cleaned_text):
            doc["text"] = cleaned_text
            doc["text_length"] = len(cleaned_text)
            cleaned_docs.append(doc)
    print(f" Quality documents: {len(cleaned_docs)} / {len(documents)}")

    # Save cleaned documents. The target directory is created on demand so a
    # fresh checkout does not fail on the first write.
    os.makedirs(CLEANED_DIR, exist_ok=True)
    for doc in cleaned_docs:
        filepath = os.path.join(CLEANED_DIR, _cleaned_filename(doc))
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(doc, f, ensure_ascii=False, indent=2)

    # 3. Chunk text
    print("\n Chunking text...")
    chunker = TextChunker(chunk_size=1200, overlap=100, min_chunk_size=200)
    chunks = chunker.chunk_all_documents(cleaned_docs)
    print(f" Total chunks: {len(chunks)}")

    # 4. Build dataset
    print("\n Building instruction dataset...")
    builder = DatasetBuilder()
    builder.build_dataset(chunks)

    # 5. Save dataset
    print("\n Saving dataset...")
    builder.save_dataset(train_ratio=0.9)
    stats = builder.get_stats()
    print(f"\n Dataset built: {stats['total_examples']} examples")
    return stats


def _cleaned_filename(doc):
    """Return a stable ``cleaned_<source>_<digest>.json`` name for *doc*.

    The digest comes from SHA-256 rather than the built-in ``hash()``:
    ``hash()`` of a ``str`` is salted per interpreter process, so names built
    from it change on every run, and reducing it modulo 10000 lets unrelated
    documents overwrite each other.
    """
    import hashlib

    # ``or`` also covers an explicit null source/url in the JSON.
    source = str(doc.get("source") or "unknown").lower().replace(" ", "_")[:20]
    # Identify the document by URL; fall back to its text so documents
    # without a URL do not all map onto the same file.
    identity = str(doc.get("url") or doc.get("text") or "")
    digest = hashlib.sha256(identity.encode("utf-8", errors="replace")).hexdigest()[:12]
    return f"cleaned_{source}_{digest}.json"
def run_upload():
    """Publish the dataset to HuggingFace.

    Creates an ``HFUploader``. If its ``username`` attribute is falsy, an
    authentication-failure message is printed and nothing is uploaded;
    otherwise ``upload_dataset()`` is called and the resulting dataset URL is
    printed.
    """
    from upload.hf_upload import HFUploader

    rule = "=" * 60
    print("\n" + rule)
    print(" KIA - HuggingFace Upload")
    print(rule + "\n")

    uploader = HFUploader()
    if uploader.username:
        repo_id = uploader.upload_dataset()
        print(f"\n Dataset live at: https://huggingface.co/datasets/{repo_id}")
    else:
        print(" Failed to authenticate with HuggingFace!")
def show_stats():
    """Print statistics about the raw data and the built dataset.

    Reports, in order: the number of raw JSON files under ``RAW_DIR`` with
    the total length of their ``"text"`` fields, the line counts of
    ``train.jsonl`` and ``validation.jsonl`` in ``DATASET_DIR``, and the
    number of ``.json`` files directly inside each subdirectory of
    ``RAW_DIR``. Missing files or directories count as empty.
    """
    from scraper.config import RAW_DIR, DATASET_DIR

    print("\n" + "=" * 60)
    print(" KIA - Statistics")
    print("=" * 60)

    # Raw data
    raw_files = glob.glob(os.path.join(RAW_DIR, "**", "*.json"), recursive=True)
    raw_chars = 0
    for path in raw_files:
        try:
            with open(path, "r", encoding="utf-8") as fh:
                data = json.load(fh)
        except (OSError, ValueError) as exc:
            # ValueError covers both invalid JSON and undecodable bytes.
            # The file still counts in "Files" but contributes no characters.
            logger.warning("Skipping unreadable raw file %s: %s", path, exc)
            continue
        text = data.get("text") if isinstance(data, dict) else None
        if isinstance(text, str):
            raw_chars += len(text)

    print("\n Raw Data:")
    print(f" Files: {len(raw_files)}")
    print(f" Total chars: {raw_chars:,}")
    # Rough estimate: five characters per word.
    print(f" Estimated words: {raw_chars // 5:,}")

    # Dataset
    train_count = _count_lines(os.path.join(DATASET_DIR, "train.jsonl"))
    val_count = _count_lines(os.path.join(DATASET_DIR, "validation.jsonl"))

    print("\n Dataset:")
    print(f" Train examples: {train_count}")
    print(f" Validation examples: {val_count}")
    print(f" Total: {train_count + val_count}")

    # By source
    print("\n By Source:")
    # RAW_DIR does not exist before the first scrape; listing it would raise.
    subdirs = sorted(os.listdir(RAW_DIR)) if os.path.isdir(RAW_DIR) else []
    for subdir in subdirs:
        subdir_path = os.path.join(RAW_DIR, subdir)
        if os.path.isdir(subdir_path):
            count = sum(1 for name in os.listdir(subdir_path) if name.endswith(".json"))
            print(f" {subdir}: {count} files")


def _count_lines(path):
    """Return the number of lines in the text file *path*, or 0 if it is not a file."""
    if not os.path.isfile(path):
        return 0
    with open(path, "r", encoding="utf-8") as fh:
        return sum(1 for _ in fh)
def run_serve():
    """Launch the FastAPI backend and the Vite frontend as child processes.

    Starts ``uvicorn app.api:app --host 0.0.0.0 --port 8001`` from this file's
    directory and ``npx vite --host`` from its ``frontend`` subdirectory, then
    blocks until both have exited. Ctrl+C (``KeyboardInterrupt``) stops the
    wait; any child still running when this function leaves, for whatever
    reason, is terminated.
    """
    import subprocess
    import os
    import sys

    cwd = os.path.dirname(os.path.abspath(__file__))
    # Needs to be able to find app.api
    sys.path.append(cwd)

    print("\n" + "=" * 60)
    print(" KIA - Booting Digital Command Center")
    print("=" * 60 + "\n")

    frontend_dir = os.path.join(cwd, "frontend")

    # With shell=True and an argument *list*, POSIX hands only the first item
    # to the shell as the command; the remaining items become arguments of
    # the shell itself, so "uvicorn" and "npx" would start without their
    # options. Use the shell only on Windows, where the list is joined into a
    # single command line and launchers such as npx are typically .cmd
    # scripts that need the shell to be resolved.
    use_shell = os.name == "nt"

    children = []
    try:
        children.append(
            subprocess.Popen(
                ["uvicorn", "app.api:app", "--host", "0.0.0.0", "--port", "8001"],
                cwd=cwd,
                shell=use_shell,
            )
        )
        children.append(
            subprocess.Popen(
                ["npx", "vite", "--host"],
                cwd=frontend_dir,
                shell=use_shell,
            )
        )
        for child in children:
            child.wait()
    except KeyboardInterrupt:
        print("\nArresting UI processes...")
    finally:
        # Also reached when the second Popen fails or an unexpected error
        # occurs, so an already-started server is never left orphaned.
        for child in children:
            if child.poll() is None:
                child.terminate()
def main():
    """Command-line entry point.

    Reads the command from ``sys.argv[1]`` (case-insensitive) and runs the
    matching pipeline step(s). Without a command the usage text is printed;
    an unrecognised command prints an error listing the valid ones.
    """
    if len(sys.argv) < 2:
        usage_lines = (
            "Usage: python main.py [scrape|process|upload|all|stats|serve]",
            "",
            "Commands:",
            " scrape - Crawl/scrape all data sources",
            " process - Process raw data into dataset",
            " upload - Upload dataset to HuggingFace",
            " all - Run scrape -> process -> upload",
            " stats - Show current data statistics",
            " serve - Launch modern 3D UI and FastAPI server",
        )
        for line in usage_lines:
            print(line)
        return

    command = sys.argv[1].lower()

    # Each command maps to the functions it runs, in order.
    pipelines = {
        "scrape": (run_scrapers,),
        "process": (run_processing,),
        "upload": (run_upload,),
        "all": (run_scrapers, run_processing, run_upload),
        "stats": (show_stats,),
        "serve": (run_serve,),
    }

    steps = pipelines.get(command)
    if steps is None:
        print(f"Unknown command: {command}")
        print("Use: scrape, process, upload, all, stats, serve")
        return

    for step in steps:
        step()


if __name__ == "__main__":
    main()