import os
import shutil
import subprocess
import time

import duckdb
from huggingface_hub import hf_hub_download, HfApi
|
|
| |
# Hugging Face access token; None when HF_TOKEN is not set in the environment.
TOKEN = os.environ.get("HF_TOKEN")

# Password handed to 7z when extracting the downloaded archive.
# NOTE(review): a credential should not live in source control. It can now be
# supplied through the ARCHIVE_PASSWORD environment variable; the literal is
# kept only as a backward-compatible fallback and should be removed once every
# deployment sets the variable.
PASS = os.environ.get("ARCHIVE_PASSWORD", "aAsHiSh_Cyb3r_H4CK$=666=DUmp*&~")

# Dataset repo the split archive parts are downloaded from.
SOURCE_REPO = "Watchhrr/HITECH_DB"
# Dataset repo the merged database file is uploaded to.
TARGET_REPO = "Watchhrr/HITECH_MASTER_DB"
# Local DuckDB file name; also used as the path inside TARGET_REPO.
MASTER_DB = "HITECH_FULL_MASTER.db"


# Shared Hub client used for the final upload.
api = HfApi(token=TOKEN)
|
|
def _remove_if_exists(path):
    """Delete *path*, doing nothing when it is already gone."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def _concat_parts(part_paths, dest_path):
    """Concatenate the files in *part_paths*, in order, into *dest_path*."""
    with open(dest_path, "wb") as merged:
        for part_path in part_paths:
            with open(part_path, "rb") as part:
                shutil.copyfileobj(part, merged)


def _extract_archive(archive_path, out_dir, password):
    """Extract *archive_path* flat into *out_dir* with 7z; raise if 7z fails."""
    # Arguments are passed as a list (no shell), so the password needs no
    # quoting and cannot be interpreted as shell syntax.
    subprocess.run(
        ["7z", "e", archive_path, f"-p{password}", "-y", f"-o{out_dir}"],
        check=True,
    )


def process_and_merge(total_parts=36, batch_size=6):
    """Download the split archive in batches, load its CSVs, and upload the DB.

    For each batch of part numbers the parts are downloaded from SOURCE_REPO,
    concatenated into ``temp_chunk.zip``, extracted with 7z into
    ``./temp_data``, and every extracted ``*.csv`` is appended to the
    ``master_data`` table of the DuckDB file MASTER_DB. After the last batch
    an index on ``mobile`` is created and the DB file is uploaded to
    TARGET_REPO.

    Args:
        total_parts: Number of the last archive part (parts are numbered
            from 1 and zero-padded to three digits in the file name).
        batch_size: How many consecutive parts are merged per batch.

    Raises:
        subprocess.CalledProcessError: If 7z exits with a non-zero status.
        Exception: Download, DuckDB and upload errors propagate unchanged.

    NOTE(review): every batch is extracted as an independent archive; this
    assumes each group of ``batch_size`` parts forms a complete zip - confirm
    against how the parts were produced. Re-running against an existing
    MASTER_DB appends the same rows again.
    """
    chunk_zip = "temp_chunk.zip"
    extract_dir = "./temp_data"

    con = duckdb.connect(MASTER_DB)
    try:
        con.execute("CREATE TABLE IF NOT EXISTS master_data (mobile VARCHAR, id VARCHAR, name VARCHAR, address VARCHAR, state VARCHAR)")

        for start in range(1, total_parts + 1, batch_size):
            end = min(start + batch_size - 1, total_parts)
            print(f"\nπ Processing Range: {start} to {end}")

            current_parts = []
            try:
                for i in range(start, end + 1):
                    part_name = f"Hi-Tek-DB.zip.{str(i).zfill(3)}"
                    print(f"π₯ Downloading {part_name}...")
                    local_path = hf_hub_download(repo_id=SOURCE_REPO, filename=part_name, repo_type="dataset", token=TOKEN, local_dir=".")
                    current_parts.append(local_path)

                print("π Merging splits...")
                # Only this batch's files are merged, in download order, so
                # stale parts left by an earlier failed run cannot leak in.
                _concat_parts(current_parts, chunk_zip)
                # Drop the parts before extracting to keep peak disk use low.
                for part_path in current_parts:
                    _remove_if_exists(part_path)

                print("β‘ Extracting with Password...")
                # Start from an empty directory so leftover CSVs are never
                # ingested a second time.
                shutil.rmtree(extract_dir, ignore_errors=True)
                _extract_archive(chunk_zip, extract_dir, PASS)
                _remove_if_exists(chunk_zip)

                print("π Injecting into Master Table...")
                con.execute("INSERT INTO master_data SELECT * FROM read_csv_auto('./temp_data/*.csv', ignore_errors=true)")
            finally:
                # Runs on success and on failure; every step is idempotent.
                for part_path in current_parts:
                    _remove_if_exists(part_path)
                _remove_if_exists(chunk_zip)
                shutil.rmtree(extract_dir, ignore_errors=True)

            print(f"β Range {start}-{end} merged into Master.")

        print("\nπ Creating Turbo Search Index...")
        # IF NOT EXISTS keeps a rerun against an existing DB from failing here.
        con.execute("CREATE INDEX IF NOT EXISTS idx_mobile ON master_data (mobile)")
        con.commit()
    finally:
        # Always release the DB file, even when a batch failed.
        con.close()

    # NOTE(review): master_data holds personal fields (mobile, name, address);
    # confirm the data's provenance and that publishing it to TARGET_REPO is
    # authorised before running this upload.
    print("βοΈ Uploading FULL MASTER DB to Hugging Face...")
    api.upload_file(path_or_fileobj=MASTER_DB, path_in_repo=MASTER_DB, repo_id=TARGET_REPO, repo_type="dataset")
|
|
| |
# Run the whole pipeline once. Any failure is reported on stdout instead of
# terminating the script, so the keep-alive loop below is always reached.
try:
    process_and_merge()
    print("π MISSION COMPLETE! Ek hi master file ban gayi.")
except Exception as err:
    print(f"π¨ Final Error: {err}")


# Park the process forever, waking once an hour.
# NOTE(review): presumably this keeps the hosting container from exiting or
# restarting after the job - confirm against the deployment.
while True:
    time.sleep(60 * 60)
|
|