# XHS/orchestrator/material_builder.py
# Trae Bot
# Upload Spider_XHS project
# c481f8a
import sqlite3
import json
import os
import sys
# Add the parent of this file's directory (the project root) to sys.path so that `orchestrator.config` can be imported
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from orchestrator.config import DB_PATH
def _ensure_processed_column(conn):
    """Add the ``processed`` flag column to ``cleaned_note`` if it is missing.

    The column list is inspected first instead of blindly running
    ``ALTER TABLE`` and swallowing ``sqlite3.OperationalError``: that error
    type also covers a missing table or a locked database, which must not be
    hidden. If ``cleaned_note`` does not exist, the ``ALTER TABLE`` below
    raises ``sqlite3.OperationalError`` and the error propagates.
    """
    # Each PRAGMA table_info row is (cid, name, type, notnull, dflt_value, pk).
    columns = [row[1] for row in conn.execute("PRAGMA table_info(cleaned_note)")]
    if "processed" not in columns:
        conn.execute("ALTER TABLE cleaned_note ADD COLUMN processed INTEGER DEFAULT 0;")
        conn.commit()


def _text_field(data, key):
    """Return ``data[key]`` stripped, or ``""`` when absent or not a string."""
    value = data.get(key, "")
    # A JSON null or number has no .strip(); treat it like a missing field.
    return value.strip() if isinstance(value, str) else ""


def _parse_note(content_str):
    """Decode one ``cleaned_content`` value into a ``(title, body)`` pair.

    Invalid JSON, a NULL column value, or a JSON document that is not an
    object all yield ``("", "")`` so that one malformed row cannot abort the
    whole run.
    """
    try:
        data = json.loads(content_str)
    except (json.JSONDecodeError, TypeError):
        return "", ""
    if not isinstance(data, dict):
        return "", ""
    return _text_field(data, "title"), _text_field(data, "content")


def main():
    """Turn unprocessed rows of ``cleaned_note`` into ``content_material`` rows.

    For every note whose ``processed`` flag is 0, the JSON in
    ``cleaned_content`` is decoded and up to two materials are inserted:
    one of type ``"title"`` (title stored as both title and body) and one of
    type ``"body"``. The note is then flagged ``processed = 1``, including
    notes that yielded no material. A summary line is printed at the end.

    Raises:
        sqlite3.Error: if the database cannot be read or written, e.g. when
            the ``cleaned_note`` or ``content_material`` table is missing.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        _ensure_processed_column(conn)

        unprocessed_notes = conn.execute(
            "SELECT id, cleaned_content FROM cleaned_note WHERE processed = 0"
        ).fetchall()
        if not unprocessed_notes:
            print("No unprocessed cleaned notes found.")
            return

        material_count = 0
        for note_id, content_str in unprocessed_notes:
            title, body = _parse_note(content_str)

            materials = []
            if title:
                materials.append((title, title, "title"))
            if body:
                # The ellipsis is appended even when the title is shorter
                # than 20 characters; kept as-is so stored labels stay
                # consistent with rows written by earlier runs.
                label = title[:20] + "..." if title else "Body Material"
                materials.append((label, body, "body"))

            # One transaction per note: the inserts and the flag update are
            # committed together, or rolled back together on failure.
            with conn:
                conn.executemany(
                    """
                    INSERT INTO content_material (title, body, type)
                    VALUES (?, ?, ?)
                    """,
                    materials,
                )
                conn.execute(
                    "UPDATE cleaned_note SET processed = 1 WHERE id = ?",
                    (note_id,),
                )
            material_count += len(materials)

        print(f"Successfully extracted {material_count} materials from {len(unprocessed_notes)} cleaned notes.")
    finally:
        # Always release the connection, including on the early return
        # above and when any statement raises.
        conn.close()
# Run the material extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()