Marcos Morales
committed on
Commit
·
dd58f3d
1
Parent(s):
11af251
modified: README.md
Browse filesmodified: app.py
modified: requirements.txt
new file: scripts/run_preprocess.sh
new file: src/__init__.py
new file: src/chunker.py
new file: src/config.py
new file: src/embeddings.py
new file: src/preprocess.py
new file: src/reader.py
- README.md +10 -13
- app.py +27 -85
- requirements.txt +15 -6
- scripts/run_preprocess.sh +6 -0
- src/__init__.py +1 -0
- src/chunker.py +17 -0
- src/config.py +14 -0
- src/embeddings.py +19 -0
- src/preprocess.py +41 -0
- src/reader.py +41 -0
README.md
CHANGED
|
@@ -1,14 +1,11 @@
|
|
| 1 |
-
|
| 2 |
-
title: Chunkings
|
| 3 |
-
emoji: 🏃
|
| 4 |
-
colorFrom: indigo
|
| 5 |
-
colorTo: blue
|
| 6 |
-
sdk: gradio
|
| 7 |
-
sdk_version: 5.38.0
|
| 8 |
-
app_file: app.py
|
| 9 |
-
pinned: false
|
| 10 |
-
license: apache-2.0
|
| 11 |
-
short_description: Docs to chunks
|
| 12 |
-
---
|
| 13 |
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# HF Vector Pipeline
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
Pipeline end‑to‑end para convertir documentos (`.md`, `.docx`, `.pdf`)
|
| 4 |
+
en un **JSONL con embeddings** y metadatos, listo para cargar en
|
| 5 |
+
**Amazon S3 Vector Features**.
|
| 6 |
+
|
| 7 |
+
Incluye:
|
| 8 |
+
|
| 9 |
+
* CLI (`python -m src.preprocess …`)
|
| 10 |
+
* UI Gradio (archivo **app.py**) preparada para HuggingFace Spaces
|
| 11 |
+
* Soporte para Windows 11 + VSCode
|
app.py
CHANGED
|
@@ -1,97 +1,39 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
import yaml
|
| 3 |
-
import json
|
| 4 |
-
import uuid
|
| 5 |
from pathlib import Path
|
| 6 |
-
from
|
| 7 |
-
import
|
| 8 |
-
from
|
| 9 |
-
import tiktoken
|
| 10 |
-
|
| 11 |
-
model = SentenceTransformer('all-MiniLM-L6-v2')
|
| 12 |
-
tokenizer = tiktoken.get_encoding("cl100k_base")
|
| 13 |
-
|
| 14 |
-
def extract_front_matter_and_body(text: str):
|
| 15 |
-
import re
|
| 16 |
-
fm_regex = r"^---\n(.*?)\n---\n(.*)$"
|
| 17 |
-
m = re.match(fm_regex, text, re.DOTALL)
|
| 18 |
-
if m:
|
| 19 |
-
meta = yaml.safe_load(m.group(1)) or {}
|
| 20 |
-
body = m.group(2)
|
| 21 |
-
else:
|
| 22 |
-
meta = {}
|
| 23 |
-
body = text
|
| 24 |
-
return meta, body
|
| 25 |
-
|
| 26 |
-
def chunk_text(text: str, max_tokens: int = 500, overlap: int = 50):
|
| 27 |
-
tokens = tokenizer.encode(text)
|
| 28 |
-
chunks = []
|
| 29 |
-
start = 0
|
| 30 |
-
while start < len(tokens):
|
| 31 |
-
end = min(start + max_tokens, len(tokens))
|
| 32 |
-
chunk_toks = tokens[start:end]
|
| 33 |
-
chunks.append(tokenizer.decode(chunk_toks))
|
| 34 |
-
start += max_tokens - overlap
|
| 35 |
-
return chunks
|
| 36 |
-
|
| 37 |
-
def process_file(path: str, vertical: str, language: str):
|
| 38 |
-
ext = Path(path).suffix.lower()
|
| 39 |
-
if ext in ['.md', '.markdown']:
|
| 40 |
-
raw = Path(path).read_text(encoding='utf-8')
|
| 41 |
-
meta, body = extract_front_matter_and_body(raw)
|
| 42 |
-
elif ext == '.docx':
|
| 43 |
-
doc = Document(path)
|
| 44 |
-
body = "\n".join(p.text for p in doc.paragraphs)
|
| 45 |
-
meta = {}
|
| 46 |
-
elif ext == '.pdf':
|
| 47 |
-
reader = PyPDF2.PdfReader(path)
|
| 48 |
-
pages = [page.extract_text() or "" for page in reader.pages]
|
| 49 |
-
body = "\n".join(pages)
|
| 50 |
-
meta = {}
|
| 51 |
-
else:
|
| 52 |
-
return []
|
| 53 |
-
|
| 54 |
-
default_meta = {
|
| 55 |
-
'vertical': vertical,
|
| 56 |
-
'language': language,
|
| 57 |
-
'source': Path(path).name
|
| 58 |
-
}
|
| 59 |
-
meta = {**default_meta, **meta}
|
| 60 |
-
records = []
|
| 61 |
-
for i, chunk in enumerate(chunk_text(body)):
|
| 62 |
-
emb = model.encode(chunk).tolist()
|
| 63 |
-
metadata = {
|
| 64 |
-
'id': f"{Path(path).stem}-chunk-{i+1:04d}",
|
| 65 |
-
'chunk_index': i+1,
|
| 66 |
-
**meta
|
| 67 |
-
}
|
| 68 |
-
records.append({'vector': emb, 'metadata': metadata})
|
| 69 |
-
return records
|
| 70 |
|
| 71 |
def run_pipeline(files, vertical, language):
|
| 72 |
-
|
| 73 |
for file_path in files:
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
f.write("\n")
|
| 82 |
-
return
|
| 83 |
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
gr.Markdown("## Ingesta para Amazon S3 Vector Features")
|
| 87 |
with gr.Row():
|
| 88 |
-
uploader = gr.File(label="Sube
|
| 89 |
-
vertical = gr.Textbox(label="Vertical
|
| 90 |
language = gr.Textbox(label="Idioma", value="es")
|
| 91 |
-
btn = gr.Button("Procesar y
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
btn.click(fn=run_pipeline, inputs=[uploader, vertical, language], outputs=output)
|
| 95 |
|
| 96 |
if __name__ == "__main__":
|
| 97 |
demo.launch()
|
|
|
|
| 1 |
import json
import os
import tempfile
import uuid
from pathlib import Path

import gradio as gr
import yaml

from src.reader import read_file
from src.chunker import chunk_text
from src.embeddings import embed_texts
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
def run_pipeline(files, vertical, language):
    """Gradio callback: read, chunk and embed every uploaded file, write one
    JSONL record per chunk to a temporary file, and return that file's path.

    Parameters
    ----------
    files : list[str]
        Filepaths delivered by the ``gr.File`` uploader (``type="filepath"``).
    vertical, language : str
        Default metadata attached to every chunk record.

    Returns
    -------
    str
        Path to the generated JSONL file (rendered as a download by gr.File).
    """
    recs = []
    for file_path in files:
        meta, body = read_file(Path(file_path))
        # Front-matter metadata (if any) overrides the UI-supplied defaults.
        base_meta = {
            "vertical": vertical,
            "language": language,
            "source": Path(file_path).name,
            **meta,
        }
        chunks = chunk_text(body)
        vecs = embed_texts(chunks)
        # embed_texts returns one vector per chunk; 1-based chunk indices.
        for i, vec in enumerate(vecs, 1):
            recs.append({
                "id": f"{Path(file_path).stem}-chunk-{i:04d}",
                "vector": vec,
                "metadata": {**base_meta, "chunk_index": i},
            })
    # tempfile.gettempdir() is portable; the hard-coded "/tmp" broke the
    # Windows 11 support the README advertises.
    out_path = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4().hex}.jsonl")
    with open(out_path, "w", encoding="utf-8") as f:
        for r in recs:
            json.dump(r, f, ensure_ascii=False)
            f.write("\n")
    return out_path
|
| 27 |
|
| 28 |
+
# Minimal Gradio UI: upload documents, run the pipeline, download the JSONL.
with gr.Blocks() as demo:
    gr.Markdown("## Ingesta para Amazon S3 Vector Features")
    with gr.Row():
        # type="filepath" hands run_pipeline plain path strings.
        uploader = gr.File(label="Sube documentos", file_count="multiple", type="filepath")
        vertical = gr.Textbox(label="Vertical", value="general")
        language = gr.Textbox(label="Idioma", value="es")
    btn = gr.Button("Procesar y generar JSONL")
    outfile = gr.File(label="Descarga JSONL")
    # run_pipeline returns a filepath; gr.File renders it as a download link.
    btn.click(run_pipeline, inputs=[uploader, vertical, language], outputs=outfile)

# NOTE(review): layout reconstructed from a diff view — whether `language`
# sits inside the Row could not be confirmed; verify against the live app.

if __name__ == "__main__":
    demo.launch()
|
requirements.txt
CHANGED
|
@@ -1,6 +1,15 @@
|
|
| 1 |
-
|
| 2 |
-
pyyaml
|
| 3 |
-
python-docx
|
| 4 |
-
PyPDF2
|
| 5 |
-
sentence-transformers
|
| 6 |
-
tiktoken
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core processing
|
| 2 |
+
pyyaml>=6.0
|
| 3 |
+
python-docx>=1.0
|
| 4 |
+
PyPDF2>=3.0
|
| 5 |
+
sentence-transformers>=2.7
|
| 6 |
+
tiktoken>=0.7
|
| 7 |
+
|
| 8 |
+
# CLI
|
| 9 |
+
click>=8.1
|
| 10 |
+
|
| 11 |
+
# UI (HuggingFace Space)
|
| 12 |
+
gradio>=4.32
|
| 13 |
+
|
| 14 |
+
# Opcional: variables de entorno
|
| 15 |
+
python-dotenv>=1.0
|
scripts/run_preprocess.sh
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Run the preprocessing CLI over the sample documents.
# Fail fast: abort on any error, unset variable, or failed pipeline stage.
set -euo pipefail

python -m src.preprocess \
  --input-dir sample_docs \
  --output dist/output.jsonl \
  --vertical SEO-LLM \
  --language es
|
src/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Package marker."""
|
src/chunker.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Chunking token‑based."""
|
| 2 |
+
from typing import List
|
| 3 |
+
import tiktoken
|
| 4 |
+
from .config import CHUNK_SIZE, CHUNK_OVERLAP
|
| 5 |
+
|
| 6 |
+
_tok = tiktoken.get_encoding("cl100k_base")
|
| 7 |
+
|
| 8 |
+
def chunk_text(text: str,
               max_tokens: int = CHUNK_SIZE,
               overlap: int = CHUNK_OVERLAP) -> List[str]:
    """Split *text* into chunks of at most *max_tokens* tokens, with *overlap*
    tokens shared between consecutive chunks.

    Raises
    ------
    ValueError
        If ``overlap >= max_tokens`` — the original advance of
        ``max_tokens - overlap`` would then be <= 0 and loop forever.
    """
    if overlap >= max_tokens:
        raise ValueError("overlap must be smaller than max_tokens")
    tokens = _tok.encode(text)
    step = max_tokens - overlap
    out: List[str] = []
    # Empty text encodes to [] and yields no chunks.
    for start in range(0, len(tokens), step):
        out.append(_tok.decode(tokens[start:start + max_tokens]))
    return out
|
src/config.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Carga de configuración y constantes globales."""
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
ENV_PATH = Path(__file__).resolve().parent.parent / ".env"
|
| 7 |
+
if ENV_PATH.exists():
|
| 8 |
+
load_dotenv(ENV_PATH)
|
| 9 |
+
|
| 10 |
+
EMBEDDING_MODEL: str = os.getenv("EMBEDDING_MODEL", "intfloat/e5-large-v2")
|
| 11 |
+
DEVICE: str = os.getenv("DEVICE", "cpu")
|
| 12 |
+
|
| 13 |
+
CHUNK_SIZE: int = 500 # tokens por chunk
|
| 14 |
+
CHUNK_OVERLAP: int = 50 # solape entre chunks
|
src/embeddings.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""SentenceTransformer wrapper."""
|
| 2 |
+
from typing import List
|
| 3 |
+
from sentence_transformers import SentenceTransformer
|
| 4 |
+
from .config import EMBEDDING_MODEL, DEVICE
|
| 5 |
+
|
| 6 |
+
_model: SentenceTransformer | None = None
|
| 7 |
+
|
| 8 |
+
def _model_instance() -> SentenceTransformer:
    """Return the shared SentenceTransformer, creating it on first use."""
    global _model
    if _model is not None:
        return _model
    # Lazy construction: loading the model is expensive, so defer it until
    # the first embedding request.
    _model = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
    return _model
|
| 13 |
+
|
| 14 |
+
def embed_texts(texts: List[str]) -> List[List[float]]:
    """Embed *texts* and return one plain-float vector per input string."""
    if not texts:
        return []
    # convert_to_numpy=True makes encode() return an ndarray, which has
    # .tolist(); with convert_to_numpy=False it returns a plain Python list
    # of tensors, and the original .tolist() call raised AttributeError.
    return _model_instance().encode(
        texts,
        show_progress_bar=False,
        convert_to_numpy=True,
    ).tolist()
|
src/preprocess.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""CLI: lee → chunkea → embed → JSONL."""
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from typing import Dict, List
|
| 4 |
+
import json
|
| 5 |
+
import click
|
| 6 |
+
from .reader import read_file
|
| 7 |
+
from .chunker import chunk_text
|
| 8 |
+
from .embeddings import embed_texts
|
| 9 |
+
|
| 10 |
+
@click.command()
@click.option("--input-dir", type=click.Path(exists=True, file_okay=False),
              required=True, help="Carpeta con documentos.")
@click.option("--output", type=click.Path(), required=True,
              help="Ruta del JSONL de salida.")
@click.option("--vertical", default="general", help="Vertical.")
@click.option("--language", default="es", help="Idioma.")
def main(input_dir: str, output: str, vertical: str, language: str):
    """Read every supported document in *input-dir*, chunk and embed it,
    and write one JSONL record per chunk to *output*."""
    recs: List[Dict] = []
    # sorted() makes output order deterministic across filesystems.
    for p in sorted(Path(input_dir).iterdir()):
        if not p.is_file():
            continue
        try:
            meta, body = read_file(p)
        except ValueError as exc:
            # Unsupported extension: skip this file instead of aborting
            # the whole run (read_file raises ValueError for unknown types).
            click.echo(f"Skipping {p.name}: {exc}", err=True)
            continue
        # Front-matter metadata (if any) overrides the CLI-supplied defaults.
        base_meta = {"vertical": vertical, "language": language, "source": p.name, **meta}
        chunks = chunk_text(body)
        vecs = embed_texts(chunks)
        for i, vec in enumerate(vecs, 1):
            recs.append({
                "id": f"{p.stem}-chunk-{i:04d}",
                "vector": vec,
                "metadata": {**base_meta, "chunk_index": i}
            })
    out = Path(output)
    out.parent.mkdir(parents=True, exist_ok=True)
    with out.open("w", encoding="utf-8") as f:
        for r in recs:
            json.dump(r, f, ensure_ascii=False)
            f.write("\n")
    click.echo(f"Wrote {len(recs)} records → {out}")

if __name__ == "__main__":
    main()
|
src/reader.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Lectura de Markdown, DOCX y PDF con front‑matter opcional."""
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from typing import Tuple, Dict
|
| 4 |
+
import re
|
| 5 |
+
import yaml
|
| 6 |
+
from docx import Document as DocxDocument
|
| 7 |
+
import PyPDF2
|
| 8 |
+
|
| 9 |
+
_FM = re.compile(r"^---\n(.*?)\n---\n(.*)$", re.DOTALL)
|
| 10 |
+
|
| 11 |
+
def _split_fm(text: str) -> Tuple[Dict, str]:
    """Separate an optional YAML front-matter header from the body text.

    Returns ``(metadata, body)``; metadata is ``{}`` when no header exists.
    """
    match = _FM.match(text)
    if not match:
        return {}, text
    header, body = match.groups()
    # safe_load returns None for an empty header block; normalize to {}.
    return yaml.safe_load(header) or {}, body
|
| 18 |
+
|
| 19 |
+
def _read_md(path: Path) -> Tuple[Dict, str]:
    """Read a Markdown file, returning (front-matter dict, body text)."""
    # "utf-8-sig" transparently strips a leading BOM, which would otherwise
    # stop the front-matter regex (anchored at ^---) from matching.
    raw = path.read_text(encoding="utf-8-sig")
    return _split_fm(raw)
|
| 22 |
+
|
| 23 |
+
def _read_docx(path: Path) -> Tuple[Dict, str]:
    """Read a .docx file; DOCX carries no front-matter, so metadata is {}."""
    document = DocxDocument(path)
    text = "\n".join(paragraph.text for paragraph in document.paragraphs)
    return {}, text
|
| 27 |
+
|
| 28 |
+
def _read_pdf(path: Path) -> Tuple[Dict, str]:
    """Extract text from every page of a PDF; metadata is always {}."""
    reader = PyPDF2.PdfReader(str(path))
    # extract_text() may return None for image-only pages; map that to "".
    pages = (page.extract_text() or "" for page in reader.pages)
    return {}, "\n".join(pages)
|
| 32 |
+
|
| 33 |
+
def read_file(path: Path) -> Tuple[Dict, str]:
    """Read *path* based on its extension, returning (metadata, body).

    Raises
    ------
    ValueError
        For any extension other than .md/.markdown/.docx/.pdf.
    """
    dispatch = {
        ".md": _read_md,
        ".markdown": _read_md,
        ".docx": _read_docx,
        ".pdf": _read_pdf,
    }
    ext = path.suffix.lower()
    if ext in dispatch:
        return dispatch[ext](path)
    raise ValueError(f"Formato no soportado: {ext}")
|