Spaces:
Runtime error
Runtime error
| import os | |
| import faiss | |
| import numpy as np | |
| import torch | |
| from sentence_transformers import SentenceTransformer | |
| from .file_handler import extract_text_from_file | |
# Run on the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    v_device = "cuda"
else:
    v_device = "cpu"

# Module-level embedding model, created once at import time and placed on
# the device selected above.
obj_embedding_model = SentenceTransformer(
    'sentence-transformers/all-MiniLM-L6-v2',
    device=v_device,
)
def process_files_to_vectors(v_folder_path):
    """
    Build a FAISS vector index from the documents found under v_folder_path.

    Walks v_folder_path recursively, extracts text from every .pdf, .pptx and
    .csv file (the extension match is case-insensitive), embeds each
    non-empty document as one vector and adds it to a flat L2 index.
    Two files are written to the 'vectors' subfolder of v_folder_path:

    * vector_index.faiss -- the FAISS index.
    * metadata.json      -- maps each document's sequential ID (its position
      in the index) to the file's path relative to v_folder_path. The
      relative path includes the file name; the absolute path is never
      stored. JSON serializes the integer IDs as string keys.

    Args:
        v_folder_path: Root folder to scan for documents.

    Returns:
        Path of the 'vectors' folder that holds the index and metadata.
    """
    # Function-scope import, hoisted to the top of the function.
    import json

    v_vector_folder = os.path.join(v_folder_path, 'vectors')
    os.makedirs(v_vector_folder, exist_ok=True)

    # Create a FAISS index (384 dimensions for all-MiniLM-L6-v2)
    v_index = faiss.IndexFlatL2(384)
    v_metadata = {}
    v_doc_counter = 0

    for v_root, v_dirs, v_files in os.walk(v_folder_path):
        # Sorting the directory list in place steers os.walk's traversal, and
        # sorting the files fixes the order within a directory, so the ID
        # assigned to each document is reproducible across runs.
        v_dirs.sort()
        for v_file in sorted(v_files):
            v_file_path = os.path.join(v_root, v_file)

            # Filter files by extension
            if not v_file_path.lower().endswith(('.pdf', '.pptx', '.csv')):
                continue

            v_text = extract_text_from_file(v_file_path)
            # The extractor is defined elsewhere; treat a None result the
            # same as empty/whitespace-only text instead of crashing on
            # None.strip().
            if not v_text or not v_text.strip():
                continue  # skip empty files

            # Convert text to embeddings (one vector per document)
            v_embeddings = obj_embedding_model.encode(
                [v_text], convert_to_tensor=True
            ).cpu().numpy()
            # FAISS expects a C-contiguous float32 matrix.
            v_embeddings = np.ascontiguousarray(v_embeddings, dtype=np.float32)
            v_index.add(v_embeddings)

            # Store only the path relative to the scanned root, keyed by the
            # vector's position in the index.
            v_metadata[v_doc_counter] = os.path.relpath(
                v_file_path, start=v_folder_path
            )
            v_doc_counter += 1

    # Save the FAISS index
    v_index_path = os.path.join(v_vector_folder, 'vector_index.faiss')
    faiss.write_index(v_index, v_index_path)

    # Save metadata (containing only reference paths)
    v_metadata_path = os.path.join(v_vector_folder, 'metadata.json')
    with open(v_metadata_path, 'w', encoding='utf-8') as obj_meta:
        json.dump(v_metadata, obj_meta, indent=4)

    return v_vector_folder