Vitalis_Devcore / src /devcore /data_pipeline.py
FerrellSyntheticIntelligence
Initial clean commit: Source code only
29cdc9d
raw
history blame
2.58 kB
import os
import shutil
import subprocess
import json
from src.devcore.syntax_ingester import SyntaxIngester
class DeveloperDataPipeline:
    """Clone a repository, run the syntax ingester over it, keep only the JSON.

    Each run shallow-clones a repository into ``target_dir``, passes the clone
    to ``SyntaxIngester.process_directory``, writes the result to
    ``<project_name>_ast_profile.json`` and removes the clone again.
    """

    def __init__(self, target_dir="src/devcore/training_data"):
        """Create the pipeline and make sure *target_dir* exists.

        Args:
            target_dir: Directory that receives the temporary clones and the
                generated ``*_ast_profile.json`` files.
        """
        self.target_dir = target_dir
        self.ingester = SyntaxIngester()
        os.makedirs(self.target_dir, exist_ok=True)

    def _is_inside_target(self, path):
        """Return True when *path* resolves to a location strictly below target_dir."""
        root = os.path.abspath(self.target_dir)
        resolved = os.path.abspath(path)
        # join(root, "") appends exactly one trailing separator, so a sibling
        # such as "<root>_other" cannot pass the prefix test.
        return resolved != root and resolved.startswith(os.path.join(root, ""))

    def _clone_repository(self, repo_url, clone_path):
        """Shallow-clone *repo_url* into *clone_path*; return True on success."""
        try:
            subprocess.run(
                ["git", "clone", "--depth", "1", repo_url, clone_path],
                check=True,
                stdout=subprocess.DEVNULL,
                # Keep git's own diagnostics so a failure can be explained.
                stderr=subprocess.PIPE,
                encoding="utf-8",
                errors="replace",
            )
        except subprocess.CalledProcessError as err:
            print("[-] Target acquisition failed. Check network or URL.")
            detail = (err.stderr or "").strip()
            if detail:
                print(f"    git: {detail}")
            return False
        except OSError as err:
            # Raised when the git executable is missing or cannot be started.
            print(f"[-] Target acquisition failed. Could not run git: {err}")
            return False
        return True

    def fetch_and_ingest(self, repo_url, project_name):
        """Clone *repo_url*, write its ingester profile as JSON, delete the clone.

        Args:
            repo_url: Repository location handed to ``git clone``.
            project_name: Name of the temporary clone directory inside
                ``target_dir`` and prefix of the output file name. It must
                resolve to a path strictly inside ``target_dir``.

        Returns:
            Path of the written ``<project_name>_ast_profile.json`` file, or
            ``None`` when the project name is rejected or the clone fails.
        """
        print("\n[=================================================]")
        print(f"[+] Initializing target acquisition: {project_name}")
        print("[=================================================]")

        clone_path = os.path.join(self.target_dir, project_name)

        # clone_path is deleted recursively below, so it must never be
        # target_dir itself or anything outside of it.
        if not self._is_inside_target(clone_path):
            print(
                f"[-] Refusing project name {project_name!r}: "
                f"it must resolve inside {self.target_dir}."
            )
            return None

        # A directory left behind by an interrupted run would make git refuse
        # to clone into it.
        if os.path.isdir(clone_path):
            print("[*] Removing stale clone left by a previous run...")
            shutil.rmtree(clone_path)

        # Step 1: Clone the target repository (latest commit only)
        print(f"[*] Cloning source data from {repo_url}...")
        if not self._clone_repository(repo_url, clone_path):
            # Drop whatever a failed clone may have left on disk.
            shutil.rmtree(clone_path, ignore_errors=True)
            return None

        try:
            # Step 2: Run the Syntax Ingester over the codebase
            print("[*] Engaging Syntax Ingester to extract logic structures...")
            structural_profiles = self.ingester.process_directory(clone_path)

            # Step 3: Compile the Hebbian Training Data
            output_file = os.path.join(
                self.target_dir, f"{project_name}_ast_profile.json"
            )
            # Serialize before opening the file so a serialization error
            # cannot leave a truncated profile behind.
            payload = json.dumps(structural_profiles, indent=2)
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(payload)

            print(
                "[+] Structural logic mapped successfully. "
                f"Profile contains {len(structural_profiles)} modules."
            )
            print(f"[+] Hebbian training payload saved to: {output_file}")
        finally:
            # Step 4: Cleanup runs even when ingestion or writing fails, so
            # the raw clone never lingers on disk.
            print("[*] Purging raw repository files to conserve system partition space...")
            if os.path.isdir(clone_path):
                shutil.rmtree(clone_path)

        print("[+] Operation complete. System storage secured.")
        return output_file
if __name__ == "__main__":
    # First live run: Flask is used as a compact, tidy framework from which
    # the model is meant to learn efficient routing and logic constraints.
    DeveloperDataPipeline().fetch_and_ingest(
        "https://github.com/pallets/flask.git",
        "flask_core_logic",
    )