File size: 5,614 Bytes
56d43af
5d6391b
 
56d43af
5d6391b
56d43af
5d6391b
 
 
 
 
 
 
 
 
 
 
 
 
56d43af
5d6391b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56d43af
 
5d6391b
 
 
 
 
 
 
56d43af
5d6391b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56d43af
5d6391b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56d43af
5d6391b
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/usr/bin/env python3
"""MEGAMIND HF Space Mind - Lightweight Python Implementation"""
import os, json, sqlite3, hashlib, time, threading, urllib.request, urllib.parse, re
from http.server import HTTPServer, BaseHTTPRequestHandler

# Runtime configuration — every knob is overridable via environment variables.
PORT = int(os.environ.get('PORT', 7860))  # 7860 is the usual Hugging Face Spaces default
BRAIN_NAME = os.environ.get('BRAIN_NAME', 'HFMind')  # display name used in responses/logs
BRAIN_DOMAIN = os.environ.get('BRAIN_DOMAIN', 'general')  # knowledge domain label
# Comma-separated list of search topics the idle crawler re-seeds its queue from.
CRAWLER_TOPICS = [t.strip() for t in os.environ.get('CRAWL_TOPICS', '').split(',') if t.strip()]
MAX_NEURONS = int(os.environ.get('NEURONS', 100000))  # modulus when mapping chunks to neuron indices
DATA_DIR = '/data'  # assumes /data is a writable persistent mount — TODO confirm on target platform

START_TIME = time.time()  # used for uptime reporting in /status
os.makedirs(DATA_DIR, exist_ok=True)
DB_PATH = os.path.join(DATA_DIR, 'brain.db')
# Mutable module-level state shared between the HTTP handler and crawler threads.
patterns_count = chunks_count = nonzeros = 0
crawl_queue = []  # pending URLs; popped by crawl_worker threads, appended when idle
activity = "initializing"  # human-readable status string surfaced by /status

def init_db():
    """Create the SQLite schema (chunks + patterns tables) if it does not exist.

    chunks   -- deduplicated text snippets, keyed by a content hash.
    patterns -- one (chunk_id, neuron_idx, weight) row per newly stored chunk.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        c = conn.cursor()
        c.execute('CREATE TABLE IF NOT EXISTS chunks (id INTEGER PRIMARY KEY, hash TEXT UNIQUE, content TEXT, source TEXT, created_at INTEGER)')
        c.execute('CREATE TABLE IF NOT EXISTS patterns (id INTEGER PRIMARY KEY, chunk_id INTEGER, neuron_idx INTEGER, weight REAL)')
        conn.commit()
    finally:
        # Guarantee the connection is released even if the DDL fails.
        conn.close()

def get_stats():
    """Refresh the cached chunk/pattern counts from the DB and return them.

    Returns:
        (chunks_count, patterns_count) tuple of ints.

    Best-effort: on an SQLite error the previously cached module-level counts
    are returned unchanged so status endpoints stay responsive mid-write.
    """
    global patterns_count, chunks_count
    try:
        conn = sqlite3.connect(DB_PATH)
        try:
            c = conn.cursor()
            c.execute('SELECT COUNT(*) FROM chunks')
            chunks_count = c.fetchone()[0]
            c.execute('SELECT COUNT(*) FROM patterns')
            patterns_count = c.fetchone()[0]
        finally:
            # Close even when a query fails; the original leaked the handle here.
            conn.close()
    except sqlite3.Error:
        pass  # deliberate best-effort: keep serving last known counts
    return chunks_count, patterns_count

def store_chunk(content, source):
    """Persist a text chunk (deduplicated by content hash) and index one pattern.

    content -- raw text; truncated to 10k chars before storage.
    source  -- provenance string (crawled URL or 'api').

    Best-effort: SQLite errors are swallowed so crawler threads keep running.
    """
    global nonzeros
    h = hashlib.sha256(content.encode()).hexdigest()[:32]
    try:
        conn = sqlite3.connect(DB_PATH)
        try:
            c = conn.cursor()
            c.execute('INSERT OR IGNORE INTO chunks (hash, content, source, created_at) VALUES (?,?,?,?)',
                      (h, content[:10000], source, int(time.time())))
            # rowcount == 1 only when the row was actually inserted; lastrowid
            # can retain a stale id after an ignored (duplicate) insert, which
            # previously created spurious pattern rows for duplicates.
            if c.rowcount == 1:
                # int(h, 16) gives a deterministic neuron index across restarts,
                # unlike hash(h) which is salted per process (PYTHONHASHSEED).
                c.execute('INSERT INTO patterns (chunk_id, neuron_idx, weight) VALUES (?,?,?)',
                          (c.lastrowid, int(h, 16) % MAX_NEURONS, len(content) / 10000.0))
                nonzeros += 1
            conn.commit()
        finally:
            conn.close()
    except sqlite3.Error:
        pass  # deliberate best-effort store

def crawl_url(url):
    """Fetch a URL, strip HTML tags, and store the cleaned text as a chunk.

    Updates the module-level `activity` string for the /status endpoint.
    Best-effort: network/decoding failures are ignored so worker loops survive.
    """
    global activity
    try:
        activity = f"crawling {url[:40]}..."
        req = urllib.request.Request(url, headers={'User-Agent': 'MEGAMIND-HF/1.0'})
        with urllib.request.urlopen(req, timeout=15) as resp:
            html = resp.read().decode('utf-8', errors='ignore')
        # Crude tag-strip: replace markup with spaces, then collapse whitespace.
        text = re.sub(r'<[^>]+>', ' ', html)
        text = re.sub(r'\s+', ' ', text).strip()
        if len(text) > 100:
            store_chunk(text[:5000], url)
    except Exception:
        # `except Exception` (not bare except) so KeyboardInterrupt/SystemExit
        # still propagate instead of being silently swallowed.
        pass

def crawl_worker():
    """Daemon loop: drain crawl_queue; when empty, sleep then re-seed from topics.

    Several workers share the module-level `crawl_queue` list, so popping is
    done EAFP-style: the original `if crawl_queue: crawl_queue.pop(0)` could
    raise IndexError when a sibling worker popped between the check and the
    pop, silently killing the thread.
    """
    global activity
    while True:
        try:
            url = crawl_queue.pop(0)
        except IndexError:
            # Queue drained: idle for a while, then enqueue topic searches.
            activity = "idle - waiting for topics"
            time.sleep(10)
            for topic in CRAWLER_TOPICS[:5]:
                crawl_queue.append(f"https://html.duckduckgo.com/html/?q={urllib.parse.quote(topic)}")
        else:
            crawl_url(url)

class Handler(BaseHTTPRequestHandler):
    """Minimal JSON API: GET /health + /status, POST /learn + /think|/query."""

    def log_message(self, *a):
        # Silence per-request access logging.
        pass

    def send_json(self, d, c=200):
        """Serialize `d` as a JSON response with status `c` and open CORS."""
        self.send_response(c)
        self.send_header('Content-Type', 'application/json')
        self.send_header('Access-Control-Allow-Origin', '*')
        self.end_headers()
        self.wfile.write(json.dumps(d).encode())

    def do_GET(self):
        if self.path in ['/', '/health']:
            # Health check stays DB-free: no stats query on this hot path.
            self.send_json({'status': 'healthy', 'name': BRAIN_NAME, 'domain': BRAIN_DOMAIN})
        elif self.path == '/status':
            chunks, patterns = get_stats()
            uptime = time.time() - START_TIME
            self.send_json({
                'name': BRAIN_NAME, 'domain': BRAIN_DOMAIN, 'role': 'hf-space-mind',
                'patterns': patterns, 'chunks': chunks, 'neurons': MAX_NEURONS,
                'nonzeros': nonzeros, 'phi': patterns / max(MAX_NEURONS, 1),
                'uptime': f"{uptime/3600:.1f}h", 'uptime_seconds': int(uptime),
                'activity': activity, 'topics': CRAWLER_TOPICS,
                'crawler': {'workers': 3, 'queue': len(crawl_queue)}
            })
        else:
            self.send_json({'error': 'not found'}, 404)

    def do_POST(self):
        length = int(self.headers.get('Content-Length', 0))
        # errors='replace' keeps a malformed body from raising UnicodeDecodeError
        # and killing the request before we can answer with a 4xx/empty data.
        body = self.rfile.read(length).decode(errors='replace') if length else '{}'
        try:
            data = json.loads(body)
        except ValueError:
            # json.JSONDecodeError subclasses ValueError; anything else should surface.
            data = {}
        if self.path == '/learn':
            content = data.get('content', '')
            if content:
                store_chunk(content, data.get('source', 'api'))
                # Refresh so the response reflects the chunk just stored; the
                # original echoed the stale chunks_count global.
                chunks, _ = get_stats()
                self.send_json({'status': 'learned', 'chunks': chunks})
            else:
                self.send_json({'error': 'no content'}, 400)
        elif self.path in ['/think', '/query']:
            query = data.get('query', data.get('q', ''))
            chunks, patterns = get_stats()
            self.send_json({
                'name': BRAIN_NAME, 'domain': BRAIN_DOMAIN, 'query': query,
                'response': f"[{BRAIN_NAME}] Knowledge about {BRAIN_DOMAIN}: {chunks} chunks, {patterns} patterns learned.",
                'patterns_matched': min(patterns, 10), 'chunks': chunks
            })
        else:
            self.send_json({'error': 'not found'}, 404)

def main():
    """Boot sequence: init schema, print banner, spawn crawlers, serve forever."""
    global activity
    init_db()
    for line in (
        f"[{BRAIN_NAME}] Starting HF Space Mind",
        f"  Domain: {BRAIN_DOMAIN}",
        f"  Topics: {CRAWLER_TOPICS}",
        f"  Port: {PORT}",
    ):
        print(line)
    # Three daemon crawler threads; daemon=True lets the process exit cleanly.
    workers = [threading.Thread(target=crawl_worker, daemon=True) for _ in range(3)]
    for w in workers:
        w.start()
    activity = "running"
    server = HTTPServer(('0.0.0.0', PORT), Handler)
    server.serve_forever()


if __name__ == '__main__':
    main()