File size: 7,609 Bytes
9fdb4cf
539078c
9fdb4cf
 
 
c80efb1
e1a830c
c80efb1
 
 
 
 
 
9fdb4cf
 
 
 
 
 
4a3f00d
9fdb4cf
 
e1a830c
9fdb4cf
e1a830c
c80efb1
e1a830c
4a3f00d
c80efb1
 
 
 
4a3f00d
c80efb1
 
e1a830c
c80efb1
4a3f00d
e1a830c
4a3f00d
e1a830c
c80efb1
4a3f00d
 
 
e1a830c
c80efb1
e1a830c
 
c80efb1
 
539078c
 
 
 
 
 
e3cd434
539078c
4a3f00d
 
 
 
 
539078c
 
4a3f00d
 
 
 
 
 
 
539078c
 
 
4a3f00d
539078c
 
 
 
 
 
4a3f00d
 
539078c
4a3f00d
 
539078c
 
 
 
 
 
 
 
 
 
4a3f00d
c80efb1
e1a830c
e3cd434
c80efb1
d8d5c48
539078c
e1a830c
 
e3cd434
e1a830c
e3cd434
e1a830c
 
 
 
 
d8d5c48
e1a830c
 
d8d5c48
4a3f00d
 
e1a830c
4a3f00d
 
 
 
 
 
d8d5c48
 
 
e1a830c
d8d5c48
 
e1a830c
 
e3cd434
e1a830c
9fdb4cf
 
 
c80efb1
bcc6e2c
9fdb4cf
 
e1a830c
d8d5c48
e1a830c
c80efb1
e1a830c
9fdb4cf
 
c80efb1
 
 
e1a830c
c80efb1
e1a830c
 
 
 
c80efb1
e1a830c
 
c80efb1
e1a830c
 
 
c80efb1
d8d5c48
 
c80efb1
4a3f00d
c80efb1
e1a830c
 
 
 
c80efb1
e1a830c
d8d5c48
c80efb1
d8d5c48
 
 
 
c80efb1
e1a830c
539078c
e1a830c
 
c80efb1
e3cd434
c80efb1
4a3f00d
d8d5c48
c80efb1
d8d5c48
 
9fdb4cf
 
 
e3cd434
4a3f00d
 
d8d5c48
e3cd434
4a3f00d
 
9fdb4cf
e3cd434
 
d8d5c48
 
e3cd434
 
 
d8d5c48
 
e3cd434
 
d8d5c48
e3cd434
d8d5c48
e1a830c
c80efb1
d8d5c48
c80efb1
 
9fdb4cf
d8d5c48
c80efb1
4a3f00d
9fdb4cf
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
#!/usr/bin/env python3
import os, sys, yaml, json, subprocess, logging, traceback, time, tarfile, re
from pathlib import Path
from datetime import datetime

# Root logging config: INFO level, mirrored to stdout and to a debug file.
# mode='w' truncates the previous run's log on every start.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('/tmp/setup_debug.log', mode='w')
    ]
)
logger = logging.getLogger(__name__)

# JSON file polled by external consumers to track setup progress.
STATUS_FILE = Path('/tmp/setup_status.json')
# Flag file whose existence marks a completed setup (makes main() idempotent).
READY_FLAG = Path('/tmp/faiss_ready')

def update_status(status, message, progress=0):
    """Persist the current setup state to STATUS_FILE and log it.

    Writes a small JSON document (status, message, progress, ISO timestamp)
    that external consumers can poll, then mirrors it to the log.
    """
    payload = {
        'status': status,
        'message': message,
        'progress': progress,
        'timestamp': datetime.now().isoformat(),
    }
    STATUS_FILE.write_text(json.dumps(payload))
    logger.info(f"STATUS [{progress}%]: {status} - {message}")

def run_cmd(cmd, desc, check=True, timeout=300):
    """Run a shell command, logging the command line, timing and output.

    Parameters:
        cmd: shell command string (executed with shell=True — callers must
             not interpolate untrusted input into it).
        desc: human-readable description used in log lines.
        check: when True, a non-zero exit raises CalledProcessError.
        timeout: seconds before subprocess.TimeoutExpired is raised.

    Returns:
        The command's captured stdout (text).

    Raises:
        subprocess.CalledProcessError: non-zero exit while check=True.
        subprocess.TimeoutExpired: command exceeded `timeout`.
    """
    logger.info("=" * 80)
    logger.info(f"🔧 {desc}")
    logger.info(f"📝 {cmd}")
    logger.info("-" * 80)

    try:
        start = time.time()
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout, check=check)
        elapsed = time.time() - start

        logger.info(f"⏱️  {elapsed:.2f}s | Exit: {result.returncode}")

        # Truncate long output so the log stays readable.
        if result.stdout.strip():
            logger.info(f"STDOUT: {result.stdout[:500]}")
        if result.stderr.strip():
            logger.warning(f"STDERR: {result.stderr[:500]}")

        if result.returncode == 0:
            logger.info(f"✅ {desc} - OK")

        return result.stdout

    except subprocess.TimeoutExpired:
        # Previously timeouts propagated unlogged; record them like other
        # failures before re-raising.
        logger.error(f"❌ FALHOU: {desc}")
        raise
    except subprocess.CalledProcessError:
        logger.error(f"❌ FALHOU: {desc}")
        raise

def clean_html_text(text):
    """Strip HTML markup from *text* and collapse whitespace.

    Non-string or empty input yields "". Tags are removed, character
    references — named (&amp;), decimal (&#39;) and hex (&#x27;) — are
    replaced by a space, and runs of whitespace collapse to one space.

    Returns the cleaned, stripped string.
    """
    if not text or not isinstance(text, str):
        return ""
    text = re.sub(r'<[^>]+>', '', text)
    # Also match numeric/hex character references; the previous pattern
    # (&[a-zA-Z]+;) left them in the "cleaned" output.
    text = re.sub(r'&#?[0-9a-zA-Z]+;', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def is_valid_value(value):
    """Return True when *value* counts as present for record filtering.

    None is invalid; strings must be non-blank after stripping; every
    other type (numbers, lists, dicts, ...) is accepted as-is.
    """
    if value is None:
        return False
    if isinstance(value, str):
        return bool(value.strip())
    # Any non-None, non-string value is valid. The original's extra
    # `isinstance(value, (int, float)): return True` branch was dead code
    # because the fall-through also returned True.
    return True

def filter_and_validate_record(record, fields_to_keep):
    """Project *record* onto *fields_to_keep*, validating every field.

    Lookup is case-tolerant: the exact key wins, otherwise the lowercase,
    Capitalized and UPPERCASE variants are tried in that order. Output keys
    are lowercased ('Id'/'ID' normalize to 'id'). The 'ementa' field, when
    a string, additionally has its HTML stripped.

    Returns (filtered_dict, True) when every requested field is present
    and valid, otherwise (None, False).
    """

    def lookup(name):
        # Exact key takes priority even if its value is None.
        if name in record:
            return record[name]
        for candidate in (name.lower(), name.capitalize(), name.upper()):
            if candidate in record:
                return record[candidate]
        return None

    filtered = {}
    for field in fields_to_keep:
        value = lookup(field)

        # Guard clauses: a missing or invalid field rejects the record.
        if value is None or not is_valid_value(value):
            return None, False

        key = 'id' if field in ['Id', 'id', 'ID'] else field.lower()

        if field.lower() == 'ementa' and isinstance(value, str):
            cleaned = clean_html_text(value)
            if not cleaned or not cleaned.strip():
                return None, False
            filtered[key] = cleaned
        else:
            filtered[key] = value

    return filtered, True

def process_tar_gz(tar_path, output_jsonl, fields_to_keep):
    """Extract jurisprudencias.jsonl from *tar_path*, filter each record
    and append the valid ones to *output_jsonl*.

    Only the first member whose name ends with 'jurisprudencias.jsonl'
    is processed.

    Returns:
        The number of valid records written (0 when no matching member
        exists in the archive).

    Raises:
        Exception: unreadable/corrupt archive or processing failure
        (logged before re-raising).
    """
    logger.info(f"📦 {tar_path.name}")

    stats = {'total': 0, 'validos': 0}

    try:
        with tarfile.open(tar_path, 'r:gz') as tar:
            for member in tar.getmembers():
                if member.name.endswith('jurisprudencias.jsonl') and member.isfile():
                    logger.info(f"  ✅ {member.name}")

                    file_obj = tar.extractfile(member)
                    content = file_obj.read().decode('utf-8')

                    lines = content.strip().split('\n')
                    stats['total'] = len(lines)

                    with open(output_jsonl, 'a', encoding='utf-8') as out:
                        for line in lines:
                            if not line.strip():
                                continue

                            # Skip malformed JSON lines only. The original
                            # bare `except: pass` also swallowed
                            # KeyboardInterrupt/SystemExit and hid real
                            # bugs in the filtering step.
                            try:
                                record = json.loads(line)
                            except json.JSONDecodeError:
                                continue

                            filtered, is_valid = filter_and_validate_record(record, fields_to_keep)
                            if is_valid:
                                out.write(json.dumps(filtered, ensure_ascii=False) + '\n')
                                stats['validos'] += 1

                    logger.info(f"  ✅ {stats['validos']}/{stats['total']}")
                    # First matching member only.
                    return stats['validos']
        return 0
    except Exception as e:
        logger.error(f"  ❌ {e}")
        raise

def main():
    """Orchestrate the RAG setup pipeline.

    Steps: skip if already done (READY_FLAG), load config.yaml, download
    each data chunk from the GitHub raw URL, filter records into a single
    JSONL file, run rag_builder.py to build the FAISS index, then mark
    the setup ready. Progress is mirrored to STATUS_FILE via
    update_status() for external pollers.

    Exits the process with code 1 on any failure (status set to 'error').
    """
    try:
        logger.info("\n" + "="*80)
        logger.info("🚀 PARA.AI RAG SETUP v3.5")
        logger.info("="*80)

        # Idempotency: a previous successful run leaves READY_FLAG behind.
        if READY_FLAG.exists():
            update_status('ready', 'Ready', 100)
            return

        with open('config.yaml') as f:
            config = yaml.safe_load(f)

        # Required config keys — a KeyError here falls through to the
        # outer handler and is reported as an error status.
        chunk_start = config['chunk_start']
        chunk_end = config['chunk_end']
        github_repo = config['github_repo']
        campos_filter = config['campos_filter']

        # Rewrite the repo URL to its raw-content equivalent; assumes the
        # 'main' branch and a 'chunks_dados' directory — TODO confirm.
        base_url = github_repo.replace('https://github.com/', 'https://raw.githubusercontent.com/')
        if base_url.endswith('.git'):
            base_url = base_url[:-4]
        base_url = f"{base_url}/main/chunks_dados"

        work_dir = Path('/tmp/work')
        work_dir.mkdir(exist_ok=True)

        # Start from a clean output file; chunks append to it one by one.
        output_jsonl = work_dir / 'all_filtered.jsonl'
        if output_jsonl.exists():
            output_jsonl.unlink()

        logger.info("\n📥 Download")
        update_status('downloading', 'Downloading', 10)

        total_validos = 0

        # Best-effort loop: a failure on one chunk is logged and skipped,
        # and the partial download (if any) is removed.
        for chunk_num in range(chunk_start, chunk_end + 1):
            chunk_name = f"chunk_dados_{chunk_num:06d}.tar.gz"
            chunk_url = f"{base_url}/{chunk_name}"
            chunk_path = work_dir / chunk_name

            try:
                run_cmd(f"curl -L -f -o {chunk_path} {chunk_url}", f"Chunk {chunk_num}", timeout=300)

                if chunk_path.exists():
                    validos = process_tar_gz(chunk_path, output_jsonl, campos_filter)
                    total_validos += validos
                    chunk_path.unlink()

            except Exception as e:
                logger.error(f"  ❌ {e}")
                if chunk_path.exists():
                    chunk_path.unlink()

        logger.info(f"\n✅ Total: {total_validos}")

        if total_validos == 0:
            raise Exception("Nenhum registro!")

        logger.info("\n🤖 Build FAISS")
        update_status('building', 'Building', 70)

        os.chdir('/home/user/app')

        # Build the FAISS index; its stdout/stderr are re-logged below so
        # everything lands in the setup log.
        result = subprocess.run(
            f"python3 rag_builder.py --input {output_jsonl}",
            shell=True,
            capture_output=True,
            text=True,
            timeout=900
        )

        if result.stdout:
            for line in result.stdout.split('\n'):
                if line.strip():
                    logger.info(line)

        if result.stderr:
            for line in result.stderr.split('\n'):
                if line.strip():
                    logger.warning(line)

        if result.returncode != 0:
            raise Exception(f"Build falhou: exit {result.returncode}")

        logger.info("✅ OK!")
        run_cmd(f"rm -rf {work_dir}", "Cleanup", check=False)

        # Persist success: status file for pollers + flag for idempotency.
        update_status('ready', f'{total_validos} docs', 100)
        READY_FLAG.touch()

    except Exception as e:
        logger.error(f"\n💥 {e}")
        logger.error(traceback.format_exc())
        update_status('error', str(e), 0)
        sys.exit(1)

# Script entry point: run the setup when executed directly.
if __name__ == "__main__":
    main()