File size: 7,609 Bytes
9fdb4cf 539078c 9fdb4cf c80efb1 e1a830c c80efb1 9fdb4cf 4a3f00d 9fdb4cf e1a830c 9fdb4cf e1a830c c80efb1 e1a830c 4a3f00d c80efb1 4a3f00d c80efb1 e1a830c c80efb1 4a3f00d e1a830c 4a3f00d e1a830c c80efb1 4a3f00d e1a830c c80efb1 e1a830c c80efb1 539078c e3cd434 539078c 4a3f00d 539078c 4a3f00d 539078c 4a3f00d 539078c 4a3f00d 539078c 4a3f00d 539078c 4a3f00d c80efb1 e1a830c e3cd434 c80efb1 d8d5c48 539078c e1a830c e3cd434 e1a830c e3cd434 e1a830c d8d5c48 e1a830c d8d5c48 4a3f00d e1a830c 4a3f00d d8d5c48 e1a830c d8d5c48 e1a830c e3cd434 e1a830c 9fdb4cf c80efb1 bcc6e2c 9fdb4cf e1a830c d8d5c48 e1a830c c80efb1 e1a830c 9fdb4cf c80efb1 e1a830c c80efb1 e1a830c c80efb1 e1a830c c80efb1 e1a830c c80efb1 d8d5c48 c80efb1 4a3f00d c80efb1 e1a830c c80efb1 e1a830c d8d5c48 c80efb1 d8d5c48 c80efb1 e1a830c 539078c e1a830c c80efb1 e3cd434 c80efb1 4a3f00d d8d5c48 c80efb1 d8d5c48 9fdb4cf e3cd434 4a3f00d d8d5c48 e3cd434 4a3f00d 9fdb4cf e3cd434 d8d5c48 e3cd434 d8d5c48 e3cd434 d8d5c48 e3cd434 d8d5c48 e1a830c c80efb1 d8d5c48 c80efb1 9fdb4cf d8d5c48 c80efb1 4a3f00d 9fdb4cf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 |
#!/usr/bin/env python3
import os, sys, yaml, json, subprocess, logging, traceback, time, tarfile, re
from pathlib import Path
from datetime import datetime
# Log simultaneously to stdout (visible in the console) and to a debug file;
# mode='w' truncates the previous run's log on every start.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('/tmp/setup_debug.log', mode='w')
    ]
)
logger = logging.getLogger(__name__)

# JSON file other processes can poll to follow setup progress.
STATUS_FILE = Path('/tmp/setup_status.json')
# Flag file whose existence marks a completed setup (makes re-runs no-ops).
READY_FLAG = Path('/tmp/faiss_ready')
def update_status(status, message, progress=0):
    """Persist the current setup phase to STATUS_FILE and log it.

    Args:
        status: Short machine-readable phase name (e.g. 'downloading').
        message: Human-readable detail for the phase.
        progress: Completion percentage, 0-100.
    """
    payload = {
        'status': status,
        'message': message,
        'progress': progress,
        'timestamp': datetime.now().isoformat(),
    }
    # Overwrite the status file atomically enough for a polling reader.
    STATUS_FILE.write_text(json.dumps(payload))
    logger.info(f"STATUS [{progress}%]: {status} - {message}")
def run_cmd(cmd, desc, check=True, timeout=300):
    """Run a shell command, logging its timing, stdout and stderr.

    Args:
        cmd: Shell command line (executed with shell=True).
        desc: Human-readable description used in log lines.
        check: If True, a non-zero exit raises CalledProcessError.
        timeout: Seconds before the command is killed.

    Returns:
        The command's stdout when it exits 0, otherwise None (only
        reachable when check=False).

    Raises:
        subprocess.CalledProcessError: non-zero exit with check=True.
        subprocess.TimeoutExpired: the command exceeded `timeout`.
    """
    logger.info("=" * 80)
    logger.info(f"🔧 {desc}")
    logger.info(f"📝 {cmd}")
    logger.info("-" * 80)
    try:
        start = time.time()
        result = subprocess.run(cmd, shell=True, capture_output=True,
                                text=True, timeout=timeout, check=check)
        elapsed = time.time() - start
        logger.info(f"⏱️ {elapsed:.2f}s | Exit: {result.returncode}")
        if result.stdout and len(result.stdout.strip()) > 0:
            logger.info(f"STDOUT: {result.stdout[:500]}")
        if result.stderr and len(result.stderr.strip()) > 0:
            logger.warning(f"STDERR: {result.stderr[:500]}")
        if result.returncode == 0:
            logger.info(f"✅ {desc} - OK")
            return result.stdout
        # check=False with a non-zero exit: the original fell off the end
        # and returned None silently; surface the failure in the log.
        logger.warning(f"⚠️ {desc} - exit {result.returncode} (check=False)")
        return None
    except subprocess.TimeoutExpired:
        # Previously unhandled: make timeouts explicit before propagating.
        logger.error(f"❌ TIMEOUT ({timeout}s): {desc}")
        raise
    except subprocess.CalledProcessError as e:
        logger.error(f"❌ FALHOU: {desc}")
        raise
def clean_html_text(text):
    """Strip HTML tags and entities from *text* and collapse whitespace.

    Returns "" for None, empty, or non-string input.
    """
    if not text or not isinstance(text, str):
        return ""
    text = re.sub(r'<[^>]+>', '', text)  # drop tags
    # Replace entities with a space. Also matches numeric entities such as
    # &#39; — the original pattern only handled named ones like &nbsp;.
    text = re.sub(r'&#?[0-9a-zA-Z]+;', ' ', text)
    text = re.sub(r'\s+', ' ', text)  # collapse runs of whitespace
    return text.strip()
def is_valid_value(value):
    """Return False for None and blank/whitespace-only strings; True otherwise."""
    if isinstance(value, str):
        # Non-empty after stripping whitespace counts as valid.
        return bool(value.strip())
    return value is not None
def filter_and_validate_record(record, fields_to_keep):
    """Project *record* onto *fields_to_keep*, validating every value.

    Lookup is case-tolerant: the exact field name is tried first, then its
    lower/capitalized/upper variants. Id-like fields are stored under 'id';
    all others under their lowercased name. An 'ementa' string is HTML-
    cleaned before being kept.

    Returns:
        (filtered_dict, True) when every field is present and non-blank,
        otherwise (None, False).
    """
    result = {}
    for field in fields_to_keep:
        # Case-tolerant lookup over the likely key spellings.
        value = None
        for candidate in (field, field.lower(), field.capitalize(), field.upper()):
            if candidate in record:
                value = record[candidate]
                break
        # Reject missing values and blank strings outright.
        if value is None:
            return None, False
        if isinstance(value, str) and not value.strip():
            return None, False
        key = 'id' if field in ('Id', 'id', 'ID') else field.lower()
        if field.lower() == 'ementa' and isinstance(value, str):
            value = clean_html_text(value)
            # A record whose ementa cleans down to nothing is useless.
            if not value.strip():
                return None, False
        result[key] = value
    return result, True
def process_tar_gz(tar_path, output_jsonl, fields_to_keep):
    """Extract the jurisprudencias.jsonl member of a .tar.gz, filter each
    JSON line through filter_and_validate_record, and append valid records
    to *output_jsonl*.

    Only the first matching member is processed (the function returns
    immediately after it).

    Returns:
        The number of valid records written; 0 when no member matches.

    Raises:
        Exception: any tar/IO error is logged, then re-raised.
    """
    logger.info(f"📦 {tar_path.name}")
    stats = {'total': 0, 'validos': 0}
    try:
        with tarfile.open(tar_path, 'r:gz') as tar:
            for member in tar.getmembers():
                if member.name.endswith('jurisprudencias.jsonl') and member.isfile():
                    logger.info(f" ✅ {member.name}")
                    file_obj = tar.extractfile(member)
                    if file_obj is None:
                        # extractfile() returns None for non-regular members;
                        # the original would have crashed on .read().
                        continue
                    content = file_obj.read().decode('utf-8')
                    lines = content.strip().split('\n')
                    stats['total'] = len(lines)
                    with open(output_jsonl, 'a', encoding='utf-8') as out:
                        for line in lines:
                            if not line.strip():
                                continue
                            try:
                                record = json.loads(line)
                                filtered, is_valid = filter_and_validate_record(record, fields_to_keep)
                            except (json.JSONDecodeError, TypeError):
                                # Skip malformed JSON or non-object lines only.
                                # The original bare `except: pass` also hid
                                # genuine write/filter bugs.
                                continue
                            if is_valid:
                                out.write(json.dumps(filtered, ensure_ascii=False) + '\n')
                                stats['validos'] += 1
                    logger.info(f" ✅ {stats['validos']}/{stats['total']}")
                    return stats['validos']
        return 0
    except Exception as e:
        logger.error(f" ❌ {e}")
        raise
def main():
    """Orchestrate the full RAG setup: download data chunks, filter records,
    and build the FAISS index.

    Flow:
      1. Short-circuit when READY_FLAG exists (idempotent re-runs).
      2. Read chunk range, repo URL and field list from config.yaml.
      3. Download each chunk tarball, filter its records into one combined
         JSONL file, and delete the tarball.
      4. Run rag_builder.py on the combined JSONL.
      5. Clean up the work dir and mark the setup as ready.

    Any failure writes an 'error' status file and exits with code 1.
    """
    try:
        logger.info("\n" + "="*80)
        logger.info("🚀 PARA.AI RAG SETUP v3.5")
        logger.info("="*80)
        # Idempotency: a previous successful run leaves READY_FLAG behind.
        if READY_FLAG.exists():
            update_status('ready', 'Ready', 100)
            return
        # Expected keys: chunk_start, chunk_end, github_repo, campos_filter.
        with open('config.yaml') as f:
            config = yaml.safe_load(f)
        chunk_start = config['chunk_start']
        chunk_end = config['chunk_end']
        github_repo = config['github_repo']
        campos_filter = config['campos_filter']
        # Rewrite the repo URL into its raw-content equivalent and point at
        # the chunks_dados directory on the main branch.
        base_url = github_repo.replace('https://github.com/', 'https://raw.githubusercontent.com/')
        if base_url.endswith('.git'):
            base_url = base_url[:-4]
        base_url = f"{base_url}/main/chunks_dados"
        work_dir = Path('/tmp/work')
        work_dir.mkdir(exist_ok=True)
        output_jsonl = work_dir / 'all_filtered.jsonl'
        # Start from a clean slate; process_tar_gz appends to this file.
        if output_jsonl.exists():
            output_jsonl.unlink()
        logger.info("\n📥 Download")
        update_status('downloading', 'Downloading', 10)
        total_validos = 0
        # Download and process chunks one at a time; each tarball is deleted
        # right after processing to bound disk usage.
        for chunk_num in range(chunk_start, chunk_end + 1):
            chunk_name = f"chunk_dados_{chunk_num:06d}.tar.gz"
            chunk_url = f"{base_url}/{chunk_name}"
            chunk_path = work_dir / chunk_name
            try:
                # -f makes curl fail on HTTP errors; -L follows redirects.
                run_cmd(f"curl -L -f -o {chunk_path} {chunk_url}", f"Chunk {chunk_num}", timeout=300)
                if chunk_path.exists():
                    validos = process_tar_gz(chunk_path, output_jsonl, campos_filter)
                    total_validos += validos
                    chunk_path.unlink()
            except Exception as e:
                # Best-effort: a failed chunk is logged and skipped.
                logger.error(f" ❌ {e}")
                if chunk_path.exists():
                    chunk_path.unlink()
        logger.info(f"\n✅ Total: {total_validos}")
        if total_validos == 0:
            raise Exception("Nenhum registro!")
        logger.info("\n🤖 Build FAISS")
        update_status('building', 'Building', 70)
        # rag_builder.py is resolved relative to the app directory.
        os.chdir('/home/user/app')
        result = subprocess.run(
            f"python3 rag_builder.py --input {output_jsonl}",
            shell=True,
            capture_output=True,
            text=True,
            timeout=900
        )
        # Relay the builder's output into our own log, line by line.
        if result.stdout:
            for line in result.stdout.split('\n'):
                if line.strip():
                    logger.info(line)
        if result.stderr:
            for line in result.stderr.split('\n'):
                if line.strip():
                    logger.warning(line)
        if result.returncode != 0:
            raise Exception(f"Build falhou: exit {result.returncode}")
        logger.info("✅ OK!")
        run_cmd(f"rm -rf {work_dir}", "Cleanup", check=False)
        update_status('ready', f'{total_validos} docs', 100)
        # Marks setup complete so future runs short-circuit at the top.
        READY_FLAG.touch()
    except Exception as e:
        logger.error(f"\n💥 {e}")
        logger.error(traceback.format_exc())
        update_status('error', str(e), 0)
        sys.exit(1)
# Script entry point.
if __name__ == "__main__":
    main()
|