"""Utilities to extract text from CSV/PDF/JSON files, clean it, and embed it."""

import importlib
import json
import mimetypes
import os
import re
import subprocess
import sys
from functools import lru_cache

import numpy as np
import pandas as pd
import PyPDF2
import spacy
import torch
from transformers import AutoModel, AutoTokenizer

SPACY_MODEL_NAME = "en_core_web_sm"

# File extensions that identify a supported format on their own.
_EXTENSION_TO_KIND = {".pdf": "pdf", ".csv": "csv", ".json": "json"}

# MIME types used as a fallback. "application/vnd.ms-excel" is deliberately
# absent: it is what some platforms report for ".csv", but it is also the
# type of real ".xls" workbooks, which pandas.read_csv cannot parse. CSV files
# reported that way are still recognised through their ".csv" extension.
_MIME_TO_KIND = {
    "application/pdf": "pdf",
    "text/csv": "csv",
    "application/json": "json",
}

# Patterns used by preprocess_text_generalized, compiled once at import time.
_URL_PATTERN = re.compile(r"http\S+|www\S+|https\S+")
_NON_PRINTABLE_ASCII_PATTERN = re.compile(r"[^\x20-\x7E]")
_WHITESPACE_PATTERN = re.compile(r"\s+")


def _load_spacy_model(name):
    """Load the spaCy pipeline *name*, downloading it first if it is missing.

    The download runs with the interpreter executing this module
    (``sys.executable``) so the model is installed into the same environment,
    and ``check=True`` turns a failed download into ``CalledProcessError``
    instead of a confusing second ``OSError`` from ``spacy.load``.
    """
    try:
        return spacy.load(name)
    except OSError:
        subprocess.run(
            [sys.executable, "-m", "spacy", "download", name],
            check=True,
        )
        # Make the freshly installed package visible to this process.
        importlib.invalidate_caches()
        return spacy.load(name)


# Load SpaCy model with a check to ensure it's downloaded
nlp = _load_spacy_model(SPACY_MODEL_NAME)


# Detect file type
def detect_file_type(file_path):
    """Return ``"pdf"``, ``"csv"`` or ``"json"`` for *file_path*.

    The file extension is checked first; if it is not one of the known
    extensions, the MIME type guessed by ``mimetypes`` is used as a fallback.

    Raises:
        ValueError: If the file is not a supported format.
    """
    extension = os.path.splitext(str(file_path))[1].lower()
    kind = _EXTENSION_TO_KIND.get(extension)
    if kind is not None:
        return kind

    file_type = mimetypes.guess_type(file_path)[0]
    kind = _MIME_TO_KIND.get(file_type)
    if kind is None:
        raise ValueError(f"Unsupported file format: {file_type}")
    return kind


# Extract text from CSV
def extract_text_from_csv(file_path):
    """Return every cell of the CSV at *file_path* joined by single spaces.

    Cells are converted with ``astype(str)``, so missing values appear as the
    literal string ``"nan"``. The header row is not included.
    """
    df = pd.read_csv(file_path)
    return " ".join(df.astype(str).stack())


# Extract text from PDF
def extract_text_from_pdf(file_path):
    """Return the concatenated text of all pages of the PDF at *file_path*.

    Pages for which the extractor yields nothing contribute an empty string
    rather than raising ``TypeError``.
    """
    pdf_reader = PyPDF2.PdfReader(file_path)
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)


# Extract text from JSON
def extract_text_from_json(file_path):
    """Return all leaf values of the JSON document, joined by spaces.

    Dictionary keys are not included; leaves are converted with ``str``.
    """

    def recursive_text_extraction(data):
        if isinstance(data, dict):
            return " ".join(
                recursive_text_extraction(value) for value in data.values()
            )
        if isinstance(data, list):
            return " ".join(recursive_text_extraction(item) for item in data)
        return str(data)

    # JSON is UTF-8 by specification; "utf-8-sig" also tolerates a leading BOM
    # and avoids depending on the platform's locale encoding.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        data = json.load(f)
    return recursive_text_extraction(data)


# Generalized text extraction
def extract_text(file_path):
    """Detect the format of *file_path* and return its text content.

    Raises:
        ValueError: If the file format is not supported.
    """
    extractors = {
        "csv": extract_text_from_csv,
        "pdf": extract_text_from_pdf,
        "json": extract_text_from_json,
    }
    file_type = detect_file_type(file_path)
    try:
        extractor = extractors[file_type]
    except KeyError:
        raise ValueError("Unsupported file format") from None
    return extractor(file_path)


def _iter_chunks(text, chunk_size):
    """Yield consecutive pieces of *text*, each at most *chunk_size* long.

    A piece ends at the last space inside its window when there is one, so
    words are not cut in half; a window without any space is cut at the hard
    limit.
    """
    start = 0
    length = len(text)
    while start < length:
        end = min(start + chunk_size, length)
        if end < length:
            cut = text.rfind(" ", start, end)
            if cut > start:
                end = cut
        yield text[start:end]
        start = end


# Preprocess text
def preprocess_text_generalized(text, chunk_size=100000):
    """Clean *text* and return its lowercase, lemmatized content words.

    URLs and characters outside printable ASCII are removed and whitespace is
    collapsed. The text is then run through the module-level spaCy pipeline
    in chunks of at most *chunk_size* characters; stop words and
    non-alphabetic tokens are dropped and the remaining lemmas are joined by
    single spaces.

    Raises:
        ValueError: If *chunk_size* is not a positive integer.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    text = _URL_PATTERN.sub("", text)
    text = _NON_PRINTABLE_ASCII_PATTERN.sub("", text)
    text = _WHITESPACE_PATTERN.sub(" ", text)

    processed_chunks = []
    for chunk in _iter_chunks(text, chunk_size):
        doc = nlp(chunk.lower())
        tokens = [
            token.lemma_
            for token in doc
            if not token.is_stop and token.is_alpha
        ]
        processed_chunks.append(" ".join(tokens))
    return " ".join(processed_chunks)


@lru_cache(maxsize=4)
def _load_embedding_model(model_name):
    """Return the ``(tokenizer, model)`` pair for *model_name*.

    Cached so repeated embedding calls do not reload the weights every time.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    return tokenizer, model


# Generate embeddings
def get_embeddings_from_huggingface(
    cleaned_text, model_name="sentence-transformers/all-MiniLM-L6-v2"
):
    """Return mean-pooled embeddings for *cleaned_text* as a NumPy array.

    *cleaned_text* is passed straight to the tokenizer, so it may be a single
    string or a list of strings. Input longer than 512 tokens is truncated,
    so only the beginning of a long text contributes to its embedding. The
    result has one row per input text.
    """
    tokenizer, model = _load_embedding_model(model_name)
    inputs = tokenizer(
        cleaned_text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    with torch.no_grad():
        outputs = model(**inputs)

    token_embeddings = outputs.last_hidden_state
    # Weight by the attention mask so padding positions (present when several
    # texts of different lengths are batched) do not dilute the mean. For a
    # single text the mask is all ones and this equals a plain mean.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    sentence_embeddings = (summed / counts).numpy()
    return sentence_embeddings