File size: 2,577 Bytes

29cdc9d

import os
import shutil
import subprocess
import json
from src.devcore.syntax_ingester import SyntaxIngester

class DeveloperDataPipeline:
    """Clone a repository, run the syntax ingester over it, and save the result.

    Each run uses ``<target_dir>/<project_name>`` as a scratch checkout and
    writes ``<target_dir>/<project_name>_ast_profile.json``. The scratch
    checkout is always deleted before the method returns or raises.
    """

    def __init__(self, target_dir="src/devcore/training_data"):
        """Store the output directory, build the ingester, and create the directory.

        Args:
            target_dir: Directory that holds both the temporary checkouts and
                the generated JSON profiles. Created if it does not exist.
        """
        self.target_dir = target_dir
        self.ingester = SyntaxIngester()
        os.makedirs(self.target_dir, exist_ok=True)

    def _scratch_path(self, project_name):
        """Return the checkout path for *project_name*, rejecting unsafe names.

        The returned path is later handed to ``shutil.rmtree``, so it must be a
        strict descendant of ``self.target_dir``.

        Raises:
            ValueError: If the name is empty or resolves to ``target_dir``
                itself or to a location outside it (``".."``, absolute paths).
        """
        root = os.path.abspath(self.target_dir)
        clone_path = os.path.join(self.target_dir, project_name)
        resolved = os.path.abspath(clone_path)
        # commonpath() itself raises ValueError when the two paths cannot be
        # compared (e.g. different drives on Windows), which is also a reject.
        if resolved == root or os.path.commonpath([root, resolved]) != root:
            raise ValueError(
                f"project_name {project_name!r} must name a location inside "
                f"{self.target_dir!r}"
            )
        return clone_path

    def fetch_and_ingest(self, repo_url, project_name):
        """
        Shallow-clone a repository, run the ingester over the checkout, write
        the result as JSON, and delete the checkout.

        Args:
            repo_url: Anything ``git clone`` accepts as a source.
            project_name: Name used for the scratch checkout directory and as
                the prefix of the ``*_ast_profile.json`` output file.

        Returns:
            The path of the written JSON profile, or ``None`` when the clone
            step failed (a message is printed in that case).

        Raises:
            ValueError: If *project_name* does not resolve to a location
                strictly inside ``target_dir``.
            Exception: Anything raised by the ingester or by JSON
                serialisation propagates after the checkout has been removed.
        """
        clone_path = self._scratch_path(project_name)

        print("\n[=================================================]")
        print(f"[+] Initializing target acquisition: {project_name}")
        print("[=================================================]")

        # A checkout left behind by an aborted run would make `git clone`
        # refuse the destination, so clear the scratch location first.
        shutil.rmtree(clone_path, ignore_errors=True)

        try:
            # Step 1: shallow-clone the target repository (argv list, no shell).
            print(f"[*] Cloning source data from {repo_url}...")
            try:
                subprocess.run(
                    ["git", "clone", "--depth", "1", repo_url, clone_path],
                    check=True,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                )
            except subprocess.CalledProcessError:
                print("[-] Target acquisition failed. Check network or URL.")
                return None
            except OSError as exc:
                # Raised when the git executable is missing or cannot be run.
                print(f"[-] Target acquisition failed. Could not run git: {exc}")
                return None

            # Step 2: run the syntax ingester over the checkout.
            print("[*] Engaging Syntax Ingester to extract logic structures...")
            structural_profiles = self.ingester.process_directory(clone_path)

            # Step 3: serialise fully before opening the file so a
            # non-serialisable profile cannot leave a truncated JSON file.
            output_file = os.path.join(
                self.target_dir, f"{project_name}_ast_profile.json"
            )
            payload = json.dumps(structural_profiles, indent=2)
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(payload)

            print(f"[+] Structural logic mapped successfully. Profile contains {len(structural_profiles)} modules.")
            print(f"[+] Hebbian training payload saved to: {output_file}")

            # Step 4: announce cleanup; the removal itself happens in `finally`
            # so it also runs when any step above fails.
            print("[*] Purging raw repository files to conserve system partition space...")
        finally:
            # Best effort: a cleanup error must not mask the original exception.
            shutil.rmtree(clone_path, ignore_errors=True)

        if os.path.exists(clone_path):
            print(f"[-] Warning: could not fully remove {clone_path}; delete it manually.")
        else:
            print("[+] Operation complete. System storage secured.")
        return output_file

if __name__ == "__main__":
    # Example run when executed as a script: ingest the Flask repository and
    # store its profile under the "flask_core_logic" project name.
    target_repo = "https://github.com/pallets/flask.git"
    pipeline = DeveloperDataPipeline()
    pipeline.fetch_and_ingest(
        repo_url=target_repo,
        project_name="flask_core_logic",
    )