File size: 3,299 Bytes
edae06c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100

import os
import ast
import zlib
import json
import logging
from logos.logos_core import get_gpf
from logos.manifold_state import ManifoldState

logger = logging.getLogger("TokenizerAgent")

class TokenizerAgent:
    """
    Protocol 6: Tokenizer Agent
    Parses a repository (Source) into Prime Tokens (Domain Potentiality Space).
    """
    def __init__(self, root_dir):
        """
        Args:
            root_dir: Repository root directory to scan for tokenizable files.
        """
        self.root_dir = root_dir
        self.manifold = ManifoldState() # Connects to logos/manifold.json

    def scan_and_tokenize(self):
        """Scans root_dir and tokenizes all supported files (.py / .md).

        Returns:
            list[dict]: one Prime Token per file that tokenized cleanly.
                        Files that fail to tokenize are logged and skipped.
        """
        tokens = []
        for root, dirs, files in os.walk(self.root_dir):
            # Prune hidden/system dirs in place so os.walk never descends
            # into them (e.g. .git, __pycache__).
            dirs[:] = [d for d in dirs if not d.startswith('.') and not d.startswith('__')]

            for file in files:
                # Tuple form of endswith: one call instead of an `or` chain.
                if file.endswith((".py", ".md")):
                    path = os.path.join(root, file)
                    token = self._tokenize_file(path)
                    if token:
                        tokens.append(token)

        self._register_tokens_to_manifold(tokens)
        return tokens

    def _tokenize_file(self, filepath):
        """Parses a single file into a Prime Token dict.

        Returns:
            dict | None: the token, or None if reading/tokenizing failed
            (the error is logged, never raised to the caller).
        """
        try:
            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()

            rel_path = os.path.relpath(filepath, self.root_dir)

            # --- Prime Topology Analysis ---
            # 1. Hash Content -> Integer Field
            # NOTE: adler32 is a fast checksum, not a cryptographic hash;
            # distinct files can collide on "id" (see dedup in
            # _register_tokens_to_manifold).
            h = zlib.adler32(content.encode('utf-8'))

            # 2. Calculate Resonance (GPF)
            gpf = get_gpf(h)

            # 3. Determine Domain (thresholds per Protocol 6 convention)
            if gpf < 200:
                domain = "INNER_SHELL"
            elif gpf < 2000:
                domain = "PRIME_CHANNEL"
            else:
                domain = "OUTER_SHELL"

            token = {
                "id": h,
                "name": os.path.basename(filepath),
                "path": rel_path,
                "type": "file",
                "geometry": {
                    "hash": h,
                    "gpf": gpf,
                    "domain": domain
                },
                "content_preview": content[:100]
            }
            return token

        except Exception as e:
            # Best-effort by design: a single unreadable file must not
            # abort the whole repository scan.
            logger.error(f"Failed to tokenize {filepath}: {e}")
            return None

    def _register_tokens_to_manifold(self, tokens):
        """Updates the physical Manifold State with new tokens.

        Nodes are deduplicated by "id" against both the existing graph and
        the current batch, then the manifold is persisted via save().
        """
        # Ensure manifold state has a graph structure
        if "graph" not in self.manifold.state:
            self.manifold.state["graph"] = {"nodes": [], "edges": []}

        nodes = self.manifold.state["graph"]["nodes"]
        existing_ids = {n["id"] for n in nodes}
        added = 0
        for t in tokens:
            # BUGFIX: grow existing_ids as we append, otherwise two tokens
            # with the same id inside one batch (identical file content ->
            # identical adler32) would both be registered.
            if t["id"] not in existing_ids:
                existing_ids.add(t["id"])
                nodes.append(t)
                added += 1

        self.manifold.save()
        # Report the count actually added, not merely scanned.
        logger.info(f"Registered {added} new tokens to Manifold ({len(tokens)} scanned).")

if __name__ == "__main__":
    # Smoke-test entry point: tokenize the current working directory.
    tokenizer = TokenizerAgent(".")
    print("Tokenizing current directory...")
    tokenizer.scan_and_tokenize()
    print("Done.")