# Snapshot metadata from the hosting page: revision 01d5a5d, file size 5,284 bytes.
import os
import json
import pandas as pd
import numpy as np
from typing import Optional, Dict
from lpm_kernel.models.l1 import L1Bio, L1Shade, L1Cluster, L1ChunkTopic
from lpm_kernel.common.repository.database_session import DatabaseSession
# Maps each process step name to the absolute path of the file that step
# writes. Paths are resolved against the working directory at import time.
output_files = {
    step_name: os.path.join(os.getcwd(), relative_path)
    for step_name, relative_path in (
        ("extract_dimensional_topics", "resources/L2/data_pipeline/raw_data/topics.json"),
        ("map_your_entity_network", "resources/L1/graphrag_indexing_output/subjective/entities.parquet"),
        ("decode_preference_patterns", "resources/L2/data/preference.json"),
        ("reinforce_identity", "resources/L2/data/selfqa.json"),
        ("augment_content_retention", "resources/L2/data/diversity.json"),
    )
}
def query_l1_version_data(version: int) -> dict:
    """
    Query the L1 bio, shades, clusters and chunk topics stored for a version.

    Args:
        version: L1 version number; every query is filtered on it.

    Returns:
        A dict with "file_type" set to "json" and "content" holding the
        version, the bio fields (with the shades nested under the bio), the
        clusters and the chunk topics.

    Raises:
        ValueError: If no L1Bio row exists for ``version``.
    """
    with DatabaseSession.session() as session:
        bio = session.query(L1Bio).filter(L1Bio.version == version).first()
        if not bio:
            # This module imports no web-framework response helpers and the
            # function is declared to return a dict, so report a missing
            # version with a plain exception and let the caller shape the
            # response. Checking first also skips the remaining queries.
            raise ValueError(f"Version {version} not found")

        shades = session.query(L1Shade).filter(L1Shade.version == version).all()
        clusters = (
            session.query(L1Cluster).filter(L1Cluster.version == version).all()
        )
        chunk_topics = (
            session.query(L1ChunkTopic)
            .filter(L1ChunkTopic.version == version)
            .all()
        )

        # Build the response while the session is still open.
        # NOTE(review): presumably needed so attribute access on the ORM rows
        # does not hit a closed session - confirm against DatabaseSession.
        data = {
            "file_type": "json",
            "content": {
                "version": version,
                "bio": {
                    "content": bio.content,
                    "content_third_view": bio.content_third_view,
                    "summary": bio.summary,
                    "summary_third_view": bio.summary_third_view,
                    "shades": [
                        {
                            "name": s.name,
                            "aspect": s.aspect,
                            "icon": s.icon,
                            "desc_third_view": s.desc_third_view,
                            "content_third_view": s.content_third_view,
                            "desc_second_view": s.desc_second_view,
                            "content_second_view": s.content_second_view,
                        }
                        for s in shades
                    ],
                },
                "clusters": [
                    {
                        "cluster_id": c.cluster_id,
                        "memory_ids": c.memory_ids,
                        "cluster_center": c.cluster_center,
                    }
                    for c in clusters
                ],
                "chunk_topics": [
                    {"chunk_id": t.chunk_id, "topic": t.topic, "tags": t.tags}
                    for t in chunk_topics
                ],
            },
        }
        return data
def read_file_content(file_path: "str | os.PathLike[str]") -> Optional[Dict]:
    """Read a ``.json`` or ``.parquet`` file and return its content with a type tag.

    Args:
        file_path: Path of the file to read, as a string or any
            ``os.PathLike`` object (for example ``pathlib.Path``). The file
            type is chosen from the suffix: ``.json`` or ``.parquet``.

    Returns:
        For JSON files, ``{"file_type": "json", "content": <parsed JSON>}``.
        For parquet files, whatever ``read_parquet_file`` returns.
        ``None`` when the suffix is unsupported or any error occurs; the
        problem is printed rather than raised.
    """
    try:
        # Normalize path-like objects to str so the suffix checks below work
        # for pathlib.Path as well; plain strings pass through unchanged.
        path = os.fspath(file_path)

        if path.endswith(".json"):
            try:
                with open(path, "r", encoding="utf-8") as f:
                    content = json.load(f)
            except Exception as e:
                # Best-effort reader: a missing or malformed file yields None.
                print(f"Error reading JSON file {file_path}: {str(e)}")
                return None
            return {"file_type": "json", "content": content}

        if path.endswith(".parquet"):
            return read_parquet_file(path)

        print(f"Unsupported file type for {file_path}")
        return None
    except Exception as e:
        # Reached when the argument itself is unusable (e.g. None).
        print(f"Error reading file {file_path}: {str(e)}")
        return None
def _to_json_native(obj):
    """Convert a value ``json.dumps`` cannot encode into a JSON-friendly one.

    Used as the ``default=`` hook, so it is only called for objects the
    standard encoder rejects. Raises ``TypeError`` for anything it does not
    recognize, as the ``json`` module expects.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        # Covers every NumPy scalar (integers, floats, np.bool_, np.str_, ...).
        return obj.item()
    if hasattr(obj, "isoformat"):
        # datetime/date-like values, including pandas timestamps.
        return obj.isoformat()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


def read_parquet_file(file_path: str) -> Optional[Dict]:
    """
    Read a parquet file and return its metadata and rows in JSON-compatible form.

    Columns named 'x' and 'y' are dropped when present. Every record is
    round-tripped through JSON so the returned content holds only plain
    Python values.

    Args:
        file_path: Path of the parquet file to read.

    Returns:
        A dict with "file_type" ("parquet"), "rows", "columns" (both counted
        after dropping 'x'/'y'), "size_bytes" (size of the file on disk) and
        "content" (list of row dicts). ``None`` if reading or conversion
        fails; the error is printed rather than raised.
    """
    try:
        df = pd.read_parquet(file_path)
        # Remove columns named 'x' and 'y' if they exist
        df = df.drop(columns=["x", "y"], errors="ignore")
        rows_as_dicts = df.to_dict(orient="records")
        records = json.loads(json.dumps(rows_as_dicts, default=_to_json_native))
        return {
            "file_type": "parquet",
            "rows": len(df),
            "columns": list(df.columns),
            "size_bytes": os.path.getsize(file_path),
            "content": records,
        }
    except Exception as e:
        print(f"Error reading parquet file {file_path}: {str(e)}")
        return None
|