File size: 5,284 Bytes
01d5a5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import os
import json
import pandas as pd
import numpy as np
from typing import Optional, Dict
from lpm_kernel.models.l1 import L1Bio, L1Shade, L1Cluster, L1ChunkTopic
from lpm_kernel.common.repository.database_session import DatabaseSession

# Output file mapping for each process step.
# Paths are resolved relative to the current working directory at import time.
_RESOURCE_ROOT = os.getcwd()

output_files = {
    "extract_dimensional_topics": os.path.join(_RESOURCE_ROOT, "resources/L2/data_pipeline/raw_data/topics.json"),
    "map_your_entity_network": os.path.join(_RESOURCE_ROOT, "resources/L1/graphrag_indexing_output/subjective/entities.parquet"),
    "decode_preference_patterns": os.path.join(_RESOURCE_ROOT, "resources/L2/data/preference.json"),
    "reinforce_identity": os.path.join(_RESOURCE_ROOT, "resources/L2/data/selfqa.json"),
    "augment_content_retention": os.path.join(_RESOURCE_ROOT, "resources/L2/data/diversity.json"),
}

def query_l1_version_data(version: int) -> dict:
    """
    Query L1 bio, shades, clusters and chunk topics for a given version.

    Args:
        version: The L1 data version to look up.

    Returns:
        On success, a dict of the form ``{"file_type": "json", "content": {...}}``
        bundling the bio (with its shades), clusters, and chunk topics.
        When no bio row exists for ``version``, a dict with a single
        ``"error"`` key describing the missing version.
    """
    with DatabaseSession.session() as session:
        # Gather all rows belonging to this version.
        bio = session.query(L1Bio).filter(L1Bio.version == version).first()
        shades = session.query(L1Shade).filter(L1Shade.version == version).all()
        clusters = (
            session.query(L1Cluster).filter(L1Cluster.version == version).all()
        )
        chunk_topics = (
            session.query(L1ChunkTopic)
            .filter(L1ChunkTopic.version == version)
            .all()
        )

        if not bio:
            # BUG FIX: the original called the undefined names
            # `jsonify`/`APIResponse` here (neither is imported in this
            # module), which raised NameError at runtime. Return a plain
            # error dict instead, keeping the declared `dict` return type.
            return {"error": f"Version {version} not found"}

        # Build response data; shades are nested under the bio entry.
        return {
            "file_type": "json",
            "content": {
                "version": version,
                "bio": {
                    "content": bio.content,
                    "content_third_view": bio.content_third_view,
                    "summary": bio.summary,
                    "summary_third_view": bio.summary_third_view,
                    "shades": [
                        {
                            "name": s.name,
                            "aspect": s.aspect,
                            "icon": s.icon,
                            "desc_third_view": s.desc_third_view,
                            "content_third_view": s.content_third_view,
                            "desc_second_view": s.desc_second_view,
                            "content_second_view": s.content_second_view,
                        }
                        for s in shades
                    ],
                },
                "clusters": [
                    {
                        "cluster_id": c.cluster_id,
                        "memory_ids": c.memory_ids,
                        "cluster_center": c.cluster_center,
                    }
                    for c in clusters
                ],
                "chunk_topics": [
                    {"chunk_id": t.chunk_id, "topic": t.topic, "tags": t.tags}
                    for t in chunk_topics
                ],
            },
        }

def read_file_content(file_path: str) -> Optional[Dict]:
    """Dispatch on the file extension and return parsed content, or None on failure."""
    try:
        if file_path.endswith(".json"):
            # Parse JSON; any read/parse failure is reported and swallowed.
            try:
                with open(file_path, 'r', encoding='utf-8') as fh:
                    parsed = json.load(fh)
            except Exception as e:
                print(f"Error reading JSON file {file_path}: {str(e)}")
                return None
            return {
                "file_type": "json",
                "content": parsed
            }

        if file_path.endswith(".parquet"):
            return read_parquet_file(file_path)

        print(f"Unsupported file type for {file_path}")
        return None
    except Exception as e:
        print(f"Error reading file {file_path}: {str(e)}")
        return None

def read_parquet_file(file_path: str) -> Optional[Dict]:
    """
    Read a parquet file, convert numpy types for JSON serialization, and return file metadata and content.
    """
    try:
        class NumpyEncoder(json.JSONEncoder):
            def default(self, obj):
                if isinstance(obj, np.integer):
                    return int(obj)
                if isinstance(obj, np.floating):
                    return float(obj)
                if isinstance(obj, np.ndarray):
                    return obj.tolist()
                return super(NumpyEncoder, self).default(obj)

        df = pd.read_parquet(file_path)
        # Remove columns named 'x' and 'y' if they exist
        df = df.drop(columns=[col for col in ['x', 'y'] if col in df.columns])
        df_dict = df.to_dict(orient='records')
        json_str = json.dumps(df_dict, cls=NumpyEncoder)
        records = json.loads(json_str)
        return {
            "file_type": "parquet",
            "rows": len(df),
            "columns": list(df.columns),
            "size_bytes": os.path.getsize(file_path),
            "content": records
        }
    except Exception as e:
        print(f"Error reading parquet file {file_path}: {str(e)}")
        return None