import json
import tempfile
import uuid
from pathlib import Path

import gradio as gr

from src.reader import read_file
from src.chunker import chunk_text
from src.embeddings import embed_texts
from src.metadata_llm import extract_metadata
def run_pipeline(files, vertical, language):
    """Read, chunk, embed and tag the uploaded documents into a JSONL file.

    For every uploaded file: read it, split the body into chunks, embed the
    chunks, enrich each chunk with LLM-extracted metadata, and emit one JSON
    record per chunk.

    Args:
        files: Iterable of file paths from the Gradio uploader; may be
            None or empty when the user clicked without uploading.
        vertical: Vertical tag stored in every record's metadata.
        language: Language code stored in every record's metadata.

    Returns:
        str: Path to the generated JSONL file (one JSON object per line).
    """
    records = []
    # `files or []` tolerates None — Gradio passes None when nothing was
    # uploaded, which previously raised TypeError on iteration.
    for file_path in files or []:
        path = Path(file_path)
        meta, body = read_file(path)
        base_meta = {
            "vertical": vertical,
            "language": language,
            "source": path.name,
            **meta,  # file-level metadata overrides the form values on key clash
        }
        chunks = chunk_text(body)
        vectors = embed_texts(chunks)
        for i, (chunk, vec) in enumerate(zip(chunks, vectors), 1):
            meta_llm = extract_metadata(chunk)
            records.append({
                # NOTE(review): ids collide if two uploads share a stem — confirm
                # whether that can happen in practice.
                "id": f"{path.stem}-chunk-{i:04d}",
                "vector": vec,
                "metadata": {**base_meta, "chunk_index": i, **meta_llm},
            })
    # tempfile.gettempdir() instead of a hard-coded "/tmp" so the app also
    # works on platforms without /tmp (e.g. Windows).
    out_path = str(Path(tempfile.gettempdir()) / f"{uuid.uuid4().hex}.jsonl")
    with open(out_path, "w", encoding="utf-8") as f:
        for r in records:
            json.dump(r, f, ensure_ascii=False)
            f.write("\n")
    return out_path
# --- Gradio UI wiring -------------------------------------------------------
# Upload row (files + vertical + language), a trigger button, and a file
# component that serves the generated JSONL back to the user.
with gr.Blocks() as demo:
    gr.Markdown("## Ingesta para Amazon S3 Vector Features")
    with gr.Row():
        file_input = gr.File(label="Sube documentos", file_count="multiple", type="filepath")
        vertical_box = gr.Textbox(label="Vertical", value="general")
        language_box = gr.Textbox(label="Idioma", value="es")
    process_btn = gr.Button("Procesar y generar JSONL")
    download = gr.File(label="Descarga JSONL")
    process_btn.click(
        run_pipeline,
        inputs=[file_input, vertical_box, language_box],
        outputs=download,
    )

if __name__ == "__main__":
    demo.launch()