|
|
|
|
|
import os, sys, yaml, json, subprocess, logging, traceback, time, tarfile, re |
|
|
from pathlib import Path |
|
|
from datetime import datetime |
|
|
|
|
|
# Log everything both to stdout (visible in the container console) and to a
# debug file; mode='w' truncates the file on every run so it only holds the
# latest setup attempt.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('/tmp/setup_debug.log', mode='w')
    ]
)
logger = logging.getLogger(__name__)

# JSON file polled by external consumers to track setup progress.
STATUS_FILE = Path('/tmp/setup_status.json')
# Sentinel file: its existence means the FAISS index was already built.
READY_FLAG = Path('/tmp/faiss_ready')
|
|
|
|
|
def update_status(status, message, progress=0):
    """Persist the current setup state to STATUS_FILE and log it.

    Args:
        status: Short machine-readable state name (e.g. 'ready', 'error').
        message: Human-readable detail string.
        progress: Completion percentage (0-100).
    """
    payload = {
        'status': status,
        'message': message,
        'progress': progress,
        'timestamp': datetime.now().isoformat(),
    }
    STATUS_FILE.write_text(json.dumps(payload))
    logger.info(f"STATUS [{progress}%]: {status} - {message}")
|
|
|
|
|
def run_cmd(cmd, desc, check=True, timeout=300):
    """Run a shell command with verbose logging of timing and output.

    Args:
        cmd: Shell command string (executed with shell=True).
        desc: Human-readable description used in the log banner.
        check: If True, raise CalledProcessError on a non-zero exit code.
        timeout: Maximum seconds to wait for the command.

    Returns:
        The command's captured stdout (str). When check=False, stdout is
        returned even for a non-zero exit code.

    Raises:
        subprocess.CalledProcessError: Non-zero exit while check=True.
        subprocess.TimeoutExpired: Command exceeded `timeout` seconds.
    """
    logger.info("="*80)
    logger.info(f"🔧 {desc}")
    logger.info(f"📝 {cmd}")
    logger.info("-"*80)

    try:
        start = time.time()
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout, check=check)
        elapsed = time.time() - start

        logger.info(f"⏱️ {elapsed:.2f}s | Exit: {result.returncode}")

        # Truncate captured output to keep the log readable on chatty commands.
        if result.stdout and result.stdout.strip():
            logger.info(f"STDOUT: {result.stdout[:500]}")
        if result.stderr and result.stderr.strip():
            logger.warning(f"STDERR: {result.stderr[:500]}")

        if result.returncode == 0:
            logger.info(f"✅ {desc} - OK")

        return result.stdout

    except subprocess.TimeoutExpired:
        # Previously timeouts escaped completely unlogged; record them
        # before propagating so the debug log explains the failure.
        logger.error(f"❌ TIMEOUT ({timeout}s): {desc}")
        raise
    except subprocess.CalledProcessError as e:
        logger.error(f"❌ FALHOU: {desc}")
        # Surface captured stderr so the failure is diagnosable from the log.
        if e.stderr:
            logger.error(f"STDERR: {str(e.stderr)[:500]}")
        raise
|
|
|
|
|
def clean_html_text(text):
    """Strip HTML markup from `text` and normalize whitespace.

    Args:
        text: Raw HTML string; any falsy or non-string value yields "".

    Returns:
        Plain text with tags removed, character entities replaced by a
        space, and runs of whitespace collapsed to single spaces.
    """
    if not text or not isinstance(text, str):
        return ""
    # Drop markup tags such as <p> or </b>.
    text = re.sub(r'<[^>]+>', '', text)
    # Replace character entities with a space. The optional '#' also covers
    # numeric/hex forms (&#39;, &#x27;) that the old pattern left behind.
    text = re.sub(r'&#?[0-9a-zA-Z]+;', ' ', text)
    # Collapse whitespace left over from tag/entity removal.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
|
|
|
|
|
def is_valid_value(value):
    """Tell whether `value` counts as usable record content.

    None is rejected, strings must contain at least one non-whitespace
    character, and every other type (numbers, lists, dicts, ...) passes.
    """
    if value is None:
        return False
    if isinstance(value, str):
        return bool(value.strip())
    # Numbers and any other non-None type are accepted as-is.
    return True
|
|
|
|
|
def filter_and_validate_record(record, fields_to_keep):
    """Project `record` onto `fields_to_keep`, validating every field.

    Field lookup is case-tolerant: the exact name is tried first, then the
    lower/capitalized/upper variants. 'ementa' values are HTML-stripped;
    Id-like keys are stored under 'id'; all other output keys are lowercased.

    Returns:
        (filtered_dict, True) when every requested field is present and
        valid, otherwise (None, False).
    """
    filtered = {}

    for field in fields_to_keep:
        # Locate the value, tolerating common capitalization variants.
        if field in record:
            value = record[field]
        else:
            value = None
            for candidate in (field.lower(), field.capitalize(), field.upper()):
                if candidate in record:
                    value = record[candidate]
                    break

        # Missing or empty field invalidates the whole record.
        if value is None or not is_valid_value(value):
            return None, False

        # Normalize the output key: id variants collapse to 'id'.
        key = 'id' if field in ('Id', 'id', 'ID') else field.lower()

        if field.lower() == 'ementa' and isinstance(value, str):
            stripped = clean_html_text(value)
            # An ementa that is empty once HTML is removed is useless.
            if not stripped.strip():
                return None, False
            filtered[key] = stripped
        else:
            filtered[key] = value

    return filtered, True
|
|
|
|
|
def process_tar_gz(tar_path, output_jsonl, fields_to_keep):
    """Extract 'jurisprudencias.jsonl' from a .tar.gz and append valid records.

    Each JSON line of the first matching member is filtered through
    filter_and_validate_record; valid records are appended, one JSON object
    per line, to `output_jsonl`.

    Args:
        tar_path: Path to the downloaded chunk archive.
        output_jsonl: Aggregate output file (opened in append mode).
        fields_to_keep: Field names required on every record.

    Returns:
        Number of valid records written from the first matching member, or
        0 when the archive contains no matching member.

    Raises:
        Exception: Propagates archive/extraction failures after logging.
    """
    logger.info(f"📦 {tar_path.name}")

    stats = {'total': 0, 'validos': 0}

    try:
        with tarfile.open(tar_path, 'r:gz') as tar:
            for member in tar.getmembers():
                if member.name.endswith('jurisprudencias.jsonl') and member.isfile():
                    logger.info(f" ✅ {member.name}")

                    file_obj = tar.extractfile(member)
                    if file_obj is None:
                        # extractfile() returns None for non-regular members;
                        # guard against AttributeError on .read().
                        continue
                    content = file_obj.read().decode('utf-8')

                    lines = content.strip().split('\n')
                    stats['total'] = len(lines)

                    skipped = 0
                    with open(output_jsonl, 'a', encoding='utf-8') as out:
                        for line in lines:
                            if not line.strip():
                                continue

                            try:
                                record = json.loads(line)
                                filtered, is_valid = filter_and_validate_record(record, fields_to_keep)
                            except (json.JSONDecodeError, TypeError, AttributeError):
                                # Malformed line or non-dict record: keep the
                                # best-effort behavior but count the skips
                                # instead of swallowing them silently.
                                skipped += 1
                                continue

                            if is_valid:
                                out.write(json.dumps(filtered, ensure_ascii=False) + '\n')
                                stats['validos'] += 1

                    if skipped:
                        logger.warning(f" ⚠️ {skipped} malformed lines skipped")
                    logger.info(f" ✅ {stats['validos']}/{stats['total']}")
                    return stats['validos']

        # No matching member found in the archive.
        return 0
    except Exception as e:
        logger.error(f" ❌ {e}")
        raise
|
|
|
|
|
def main():
    """Orchestrate the full RAG setup: download chunks, filter, build FAISS.

    Steps: short-circuit if READY_FLAG exists; read config.yaml; download
    each chunk archive with curl; filter its records into one JSONL file;
    run rag_builder.py on the result; mark the setup ready. Any failure is
    logged, written to the status file, and terminates with exit code 1.
    """
    try:
        logger.info("\n" + "="*80)
        logger.info("🚀 PARA.AI RAG SETUP v3.5")
        logger.info("="*80)

        # Idempotency guard: a previous run already built the index.
        if READY_FLAG.exists():
            update_status('ready', 'Ready', 100)
            return

        # config.yaml is expected in the current working directory.
        with open('config.yaml') as f:
            config = yaml.safe_load(f)

        chunk_start = config['chunk_start']
        chunk_end = config['chunk_end']
        github_repo = config['github_repo']
        campos_filter = config['campos_filter']

        # Turn the repo URL into a raw.githubusercontent.com base for the
        # chunk files (main branch, chunks_dados directory).
        base_url = github_repo.replace('https://github.com/', 'https://raw.githubusercontent.com/')
        if base_url.endswith('.git'):
            base_url = base_url[:-4]
        base_url = f"{base_url}/main/chunks_dados"

        work_dir = Path('/tmp/work')
        work_dir.mkdir(exist_ok=True)

        # Output is opened in append mode downstream, so remove leftovers
        # from a previous (failed) run first.
        output_jsonl = work_dir / 'all_filtered.jsonl'
        if output_jsonl.exists():
            output_jsonl.unlink()

        logger.info("\n📥 Download")
        update_status('downloading', 'Downloading', 10)

        total_validos = 0

        # Chunk range is inclusive on both ends.
        for chunk_num in range(chunk_start, chunk_end + 1):
            chunk_name = f"chunk_dados_{chunk_num:06d}.tar.gz"
            chunk_url = f"{base_url}/{chunk_name}"
            chunk_path = work_dir / chunk_name

            try:
                # -f makes curl fail (non-zero exit) on HTTP errors so a
                # missing chunk raises instead of saving an error page.
                run_cmd(f"curl -L -f -o {chunk_path} {chunk_url}", f"Chunk {chunk_num}", timeout=300)

                if chunk_path.exists():
                    validos = process_tar_gz(chunk_path, output_jsonl, campos_filter)
                    total_validos += validos
                    # Delete the archive immediately to bound disk usage.
                    chunk_path.unlink()

            except Exception as e:
                # One bad chunk must not abort the whole setup; clean up
                # any partial download and continue with the next chunk.
                logger.error(f" ❌ {e}")
                if chunk_path.exists():
                    chunk_path.unlink()

        logger.info(f"\n✅ Total: {total_validos}")

        if total_validos == 0:
            raise Exception("Nenhum registro!")

        logger.info("\n🤖 Build FAISS")
        update_status('building', 'Building', 70)

        # rag_builder.py resolves paths relative to the app directory.
        os.chdir('/home/user/app')

        result = subprocess.run(
            f"python3 rag_builder.py --input {output_jsonl}",
            shell=True,
            capture_output=True,
            text=True,
            timeout=900
        )

        # Relay the builder's output line by line into our own log.
        if result.stdout:
            for line in result.stdout.split('\n'):
                if line.strip():
                    logger.info(line)

        if result.stderr:
            for line in result.stderr.split('\n'):
                if line.strip():
                    logger.warning(line)

        if result.returncode != 0:
            raise Exception(f"Build falhou: exit {result.returncode}")

        logger.info("✅ OK!")
        # Best-effort cleanup (check=False): a failure here must not fail setup.
        run_cmd(f"rm -rf {work_dir}", "Cleanup", check=False)

        update_status('ready', f'{total_validos} docs', 100)
        # Touch the flag last so a crash anywhere above triggers a rebuild.
        READY_FLAG.touch()

    except Exception as e:
        # Top-level boundary: log, expose the error via the status file,
        # and exit non-zero so the supervisor sees the failure.
        logger.error(f"\n💥 {e}")
        logger.error(traceback.format_exc())
        update_status('error', str(e), 0)
        sys.exit(1)
|
|
|
|
|
# Script entry point: run the setup only when executed directly.
if __name__ == "__main__":
    main()
|
|
|