Files changed (3) hide show
  1. Dockerfile +20 -18
  2. requirements.txt +1 -2
  3. app.py → train.py +48 -63
Dockerfile CHANGED
@@ -1,34 +1,36 @@
 
1
  FROM python:3.9-slim-bookworm
2
 
 
3
  WORKDIR /app
4
 
5
- # Install git + LFS for data cloning
6
- RUN apt-get update && apt-get install -y git git-lfs && git-lfs install && rm -rf /var/lib/apt/lists/*
7
-
8
- # Clone data files from the space repo
9
  RUN git clone https://huggingface.co/spaces/Finish-him/prometheus-embedding-generator ./dados && cd dados && git lfs pull
10
 
11
- # Python deps
12
  COPY requirements.txt .
13
  RUN pip install --no-cache-dir --upgrade -r requirements.txt
14
 
15
- # Cache dir for models
16
  ENV HF_HOME=/app/cache/huggingface
17
  ENV SENTENCE_TRANSFORMERS_HOME=/app/cache/torch
18
- RUN mkdir -p $HF_HOME $SENTENCE_TRANSFORMERS_HOME
19
 
20
- # Pre-download model
21
- RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('intfloat/multilingual-e5-large', cache_folder='/app/cache/torch')"
22
 
23
- # Create output dirs with proper permissions
24
- RUN mkdir -p /app/dados_extraidos /app/output && \
25
- chown -R 1000:1000 /app/dados_extraidos /app/output /app/cache
26
 
27
- # Copy the actual app file (was train.py, now app.py)
28
- COPY app.py .
 
 
 
29
 
30
- ENV GRADIO_SERVER_NAME="0.0.0.0"
31
- ENV GRADIO_SERVER_PORT="7860"
32
- EXPOSE 7860
33
 
34
- CMD ["python", "app.py"]
 
 
 
1
# Recent, supported slim base image
FROM python:3.9-slim-bookworm

# Working directory for every subsequent step
WORKDIR /app

# --- STEP 1: install git + git-lfs and clone the full dataset ---
# FIX: the apt package lists must be removed in the same layer, otherwise
# they are baked into the image and bloat it for no benefit.
RUN apt-get update \
    && apt-get install -y --no-install-recommends git git-lfs \
    && git-lfs install \
    && rm -rf /var/lib/apt/lists/*

RUN git clone https://huggingface.co/spaces/Finish-him/prometheus-embedding-generator ./dados \
    && cd dados \
    && git lfs pull

# --- STEP 2: set up the Python environment and pre-warm the cache ---
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Cache locations for Hugging Face / sentence-transformers downloads
ENV HF_HOME=/app/cache/huggingface
ENV SENTENCE_TRANSFORMERS_HOME=/app/cache/torch
RUN mkdir -p $HF_HOME $SENTENCE_TRANSFORMERS_HOME

# Pre-download the e5-large model so container startup does not hit the network
RUN python -c "import os; from sentence_transformers import SentenceTransformer; SentenceTransformer('intfloat/multilingual-e5-large', cache_folder=os.environ.get('SENTENCE_TRANSFORMERS_HOME', '/app/cache/torch'))"

# --- STEP 3: prepare writable directories and copy the training script ---
# Create the output directories and hand them to the default non-root
# user (uid 1000) so the script can write there at runtime.
RUN mkdir -p /app/dados_extraidos /app/output \
    && chown -R 1000:1000 /app/dados_extraidos /app/output /app/cache

COPY train.py .

# --- STEP 4: run the script, then keep the container alive ---
CMD ["sh", "-c", "python train.py && sleep infinity"]
requirements.txt CHANGED
@@ -1,5 +1,4 @@
1
  sentence-transformers
2
  numpy
3
  torch
4
- tqdm
5
- gradio==4.44.0
 
1
  sentence-transformers
2
  numpy
3
  torch
4
+ tqdm
 
app.py → train.py RENAMED
@@ -1,27 +1,24 @@
1
- # --- 1. IMPORTS ---
2
  import os
3
  import glob
4
  import json
5
  import csv
6
  import numpy as np
 
7
  from sentence_transformers import SentenceTransformer
8
  import zipfile
9
  import xml.etree.ElementTree as ET
10
- import gradio as gr
11
- import shutil
12
 
13
- # --- 2. CONFIGURAÇÕES ---
14
- DATA_DIR = "dados"
15
- EXTRACT_DIR = os.path.join(DATA_DIR, "dados_extraidos")
16
- OUTPUT_FILENAME = "meus_embeddings_e5_large.npy"
17
 
18
- # --- 3. FUNÇÕES DE PROCESSAMENTO ---
19
  def setup_data():
20
  os.makedirs(EXTRACT_DIR, exist_ok=True)
21
- zip_files = glob.glob(os.path.join(DATA_DIR, "*.zip"))
22
  if not zip_files:
 
23
  return DATA_DIR
24
  for zip_path in zip_files:
 
25
  with zipfile.ZipFile(zip_path, 'r') as zf:
26
  zf.extractall(EXTRACT_DIR)
27
  return EXTRACT_DIR
@@ -46,87 +43,75 @@ def serialize_item_to_text(item_dict):
46
  return str(item_dict)
47
  for key, value in item_dict.items():
48
  if isinstance(value, dict):
49
- parts.append(f"{key} ({serialize_item_to_text(value)})")
 
50
  elif isinstance(value, list):
51
- list_str = ", ".join([serialize_item_to_text(i) for i in value])
52
  parts.append(f"{key}: [{list_str}]")
53
  else:
54
  parts.append(f"{key}: {value}")
55
  return ", ".join(parts)
56
 
57
- # --- 4. PIPELINE PRINCIPAL ---
58
- def run_full_process():
59
- yield "Iniciando... Descompactando arquivos..."
60
  process_dir = setup_data()
61
-
62
  csv.field_size_limit(10_000_000)
63
- all_files = (
64
- glob.glob(os.path.join(process_dir, "**/*.json"), recursive=True)
65
- + glob.glob(os.path.join(process_dir, "**/*.csv"), recursive=True)
66
- + glob.glob(os.path.join(process_dir, "**/*.xml"), recursive=True)
67
- )
68
- yield f"Encontrados {len(all_files)} arquivos para processar."
 
 
69
 
70
  documents = []
71
- for idx, filepath in enumerate(all_files):
72
  try:
73
- yield f"Processando arquivo {idx + 1}/{len(all_files)}: {os.path.basename(filepath)}"
74
- if filepath.endswith(".json"):
75
- with open(filepath, "r", encoding="utf-8") as f:
76
  data = json.load(f)
77
  if isinstance(data, list):
78
- for item in data:
79
- documents.append(serialize_item_to_text(item))
80
  else:
81
  documents.append(serialize_item_to_text(data))
82
- elif filepath.endswith(".csv"):
83
- with open(filepath, "r", encoding="utf-8") as f:
84
  reader = csv.DictReader(f)
85
- for row in reader:
86
- documents.append(serialize_item_to_text(row))
87
- elif filepath.endswith(".xml"):
88
  tree = ET.parse(filepath)
89
  root = tree.getroot()
90
  xml_dict = {root.tag: xml_to_dict(root)}
91
  documents.append(serialize_item_to_text(xml_dict))
92
  except Exception as e:
93
- yield f"Erro ao processar {os.path.basename(filepath)}: {e}"
94
 
95
- yield f"Processamento concluido! {len(documents)} documentos criados."
96
  if not documents:
97
- yield "Nenhum documento encontrado. Processo encerrado."
98
  return
99
 
100
- # --- ETAPA 2: GERAÇÃO DE EMBEDDINGS ---
101
- yield "Carregando modelo intfloat/multilingual-e5-large..."
102
- cache_path = "./model_cache"
103
- os.makedirs(cache_path, exist_ok=True)
104
  model = SentenceTransformer(
105
- "intfloat/multilingual-e5-large", cache_folder=cache_path
 
106
  )
 
 
 
 
 
 
107
 
108
- yield f"Gerando embeddings para {len(documents)} documentos..."
109
- batch_size = 32
110
- all_embeddings = []
111
- for i in range(0, len(documents), batch_size):
112
- batch = documents[i : i + batch_size]
113
- embeddings = model.encode(batch, show_progress_bar=False)
114
- all_embeddings.append(embeddings)
115
- yield f"Batch {i // batch_size + 1}/{(len(documents) - 1) // batch_size + 1} concluido."
116
-
117
- final_embeddings = np.vstack(all_embeddings)
118
- np.save(OUTPUT_FILENAME, final_embeddings)
119
- yield f"Embeddings salvos em {OUTPUT_FILENAME}! Shape: {final_embeddings.shape}"
120
- yield f"Processo completo! {final_embeddings.shape[0]} embeddings de dimensao {final_embeddings.shape[1]}."
121
-
122
- # --- 5. INTERFACE GRADIO ---
123
- with gr.Blocks(title="Prometheus Embedding Generator") as demo:
124
- gr.Markdown("# Prometheus Embedding Generator")
125
- gr.Markdown("Gera embeddings a partir dos dados do repositorio usando multilingual-e5-large.")
126
-
127
- run_btn = gr.Button("Iniciar Processamento", variant="primary")
128
- output = gr.Textbox(label="Progresso", lines=15, interactive=False)
129
 
130
- run_btn.click(fn=run_full_process, outputs=output)
131
 
132
- demo.launch()
 
 
 
1
  import os
2
  import glob
3
  import json
4
  import csv
5
  import numpy as np
6
+ from tqdm.auto import tqdm
7
  from sentence_transformers import SentenceTransformer
8
  import zipfile
9
  import xml.etree.ElementTree as ET
 
 
10
 
11
+ DATA_DIR = "/app/dados"
12
+ EXTRACT_DIR = "/app/dados_extraidos"
 
 
13
 
 
14
def setup_data():
    """Unpack every .zip archive found under DATA_DIR into EXTRACT_DIR.

    Returns EXTRACT_DIR when at least one archive was extracted;
    falls back to DATA_DIR when no archives are present.
    """
    os.makedirs(EXTRACT_DIR, exist_ok=True)
    archives = glob.glob(DATA_DIR + "/**/*.zip", recursive=True)
    if not archives:
        print("Nenhum arquivo .zip encontrado, usando o diretório de dados principal.")
        return DATA_DIR
    for zip_path in archives:
        print(f"Descompactando {zip_path}...")
        with zipfile.ZipFile(zip_path, 'r') as archive:
            archive.extractall(EXTRACT_DIR)
    return EXTRACT_DIR
 
43
  return str(item_dict)
44
  for key, value in item_dict.items():
45
  if isinstance(value, dict):
46
+ nested_text = serialize_item_to_text(value)
47
+ parts.append(f"{key} ({nested_text})")
48
  elif isinstance(value, list):
49
+ list_str = ', '.join([serialize_item_to_text(i) for i in value])
50
  parts.append(f"{key}: [{list_str}]")
51
  else:
52
  parts.append(f"{key}: {value}")
53
  return ", ".join(parts)
54
 
55
def main():
    """End-to-end pipeline: unpack data, serialize records, embed, save.

    Walks the data directory for JSON/CSV/XML files, flattens every record
    to text via serialize_item_to_text, encodes the texts in batches with
    multilingual-e5-large, and writes ONE well-formed .npy matrix to
    /app/output.
    """
    process_dir = setup_data()
    # Some CSV rows carry very large serialized payloads; raise the default limit.
    csv.field_size_limit(10_000_000)

    all_files = (
        glob.glob(process_dir + "/**/*.json", recursive=True)
        + glob.glob(process_dir + "/**/*.csv", recursive=True)
        + glob.glob(process_dir + "/**/*.xml", recursive=True)
    )
    print(f"\n🔎 Encontrados {len(all_files)} arquivos (JSON, CSV, XML) para processar.")

    if not all_files:
        return

    documents = []
    for filepath in tqdm(all_files, desc="Processando arquivos"):
        try:
            if filepath.endswith('.json'):
                with open(filepath, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                if isinstance(data, list):
                    for item in data:
                        documents.append(serialize_item_to_text(item))
                else:
                    documents.append(serialize_item_to_text(data))
            elif filepath.endswith('.csv'):
                with open(filepath, 'r', encoding='utf-8') as f:
                    reader = csv.DictReader(f)
                    for row in reader:
                        documents.append(serialize_item_to_text(row))
            elif filepath.endswith('.xml'):
                tree = ET.parse(filepath)
                root = tree.getroot()
                xml_dict = {root.tag: xml_to_dict(root)}
                documents.append(serialize_item_to_text(xml_dict))
        except Exception as e:
            # Best effort: one malformed file must not abort the whole run.
            print(f"⚠️ Erro ao processar o arquivo {filepath}: {e}")

    print(f"\nProcessamento de arquivos concluído! {len(documents)} documentos foram criados.")
    if not documents:
        return

    cache_path = os.environ.get('SENTENCE_TRANSFORMERS_HOME', '/app/cache/torch')

    print("Carregando modelo de alta performance: intfloat/multilingual-e5-large")
    model = SentenceTransformer(
        'intfloat/multilingual-e5-large',
        cache_folder=cache_path
    )

    batch_size = 128
    output_filename = '/app/output/meus_embeddings_e5_large.npy'
    # Robustness: make sure the output directory exists even outside Docker.
    os.makedirs(os.path.dirname(output_filename), exist_ok=True)
    if os.path.exists(output_filename):
        os.remove(output_filename)

    print(f"🚀 Iniciando geração de embeddings (lotes de {batch_size}).")
    # BUG FIX: the previous version appended each batch with np.save on an
    # 'ab' file handle, stacking several independent .npy headers in one
    # file — a plain np.load would silently return only the first batch.
    # Accumulate the batches and write a single well-formed array instead.
    all_embeddings = []
    for i in tqdm(range(0, len(documents), batch_size), desc="Gerando Embeddings"):
        batch = documents[i:i + batch_size]
        all_embeddings.append(model.encode(batch, show_progress_bar=False))

    final_embeddings = np.vstack(all_embeddings)
    np.save(output_filename, final_embeddings)

    print(f"✅ Processo finalizado! Embeddings salvos em '{output_filename}'.")

if __name__ == "__main__":
    main()