Spaces:

Jagukumar
/

Text-To-Embeddings

Running

App Files Files Community

Jagukumar commited on Nov 25, 2024

Commit

ae65157

verified ·

1 Parent(s): a0ce01f

Update processing.py

Browse files

Files changed (1) hide show

processing.py +99 -93

processing.py CHANGED Viewed

@@ -1,93 +1,99 @@
-import mimetypes
-import pandas as pd
-import PyPDF2
-import json
-import re
-import spacy
-import numpy as np
-from transformers import AutoTokenizer, AutoModel
-import torch
-# Load SpaCy model
-nlp = spacy.load("en_core_web_sm")
-# Detect file type
-def detect_file_type(file_path):
-    file_type = mimetypes.guess_type(file_path)[0]
-    if file_type in ["application/pdf"]:
-        return "pdf"
-    elif file_type in ["text/csv", "application/vnd.ms-excel"]:
-        return "csv"
-    elif file_type == "application/json":
-        return "json"
-    else:
-        raise ValueError(f"Unsupported file format: {file_type}")
-# Extract text from CSV
-def extract_text_from_csv(file_path):
-    df = pd.read_csv(file_path)
-    text = " ".join(df.astype(str).stack())
-    return text
-# Extract text from PDF
-def extract_text_from_pdf(file_path):
-    pdf_reader = PyPDF2.PdfReader(file_path)
-    text = ""
-    for page in pdf_reader.pages:
-        text += page.extract_text()
-    return text
-# Extract text from JSON
-def extract_text_from_json(file_path):
-    def recursive_text_extraction(data):
-        if isinstance(data, dict):
-            return " ".join(recursive_text_extraction(value) for value in data.values())
-        elif isinstance(data, list):
-            return " ".join(recursive_text_extraction(item) for item in data)
-        else:
-            return str(data)
-    with open(file_path, 'r') as f:
-        data = json.load(f)
-    return recursive_text_extraction(data)
-# Generalized text extraction
-def extract_text(file_path):
-    file_type = detect_file_type(file_path)
-    if file_type == "csv":
-        return extract_text_from_csv(file_path)
-    elif file_type == "pdf":
-        return extract_text_from_pdf(file_path)
-    elif file_type == "json":
-        return extract_text_from_json(file_path)
-    else:
-        raise ValueError("Unsupported file format")
-# Preprocess text
-def preprocess_text_generalized(text):
-    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
-    text = re.sub(r"[^\x20-\x7E]", "", text)
-    text = re.sub(r"\s+", " ", text)
-    chunk_size = 100000
-    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
-    processed_chunks = []
-    for chunk in chunks:
-        doc = nlp(chunk.lower())
-        tokens = [
-            token.lemma_
-            for token in doc
-            if not token.is_stop and token.is_alpha
-        ]
-        processed_chunks.append(" ".join(tokens))
-    processed_text = " ".join(processed_chunks)
-    return processed_text
-# Generate embeddings
-def get_embeddings_from_huggingface(cleaned_text, model_name="sentence-transformers/all-MiniLM-L6-v2"):
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModel.from_pretrained(model_name)
-    inputs = tokenizer(cleaned_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
-    with torch.no_grad():
-        outputs = model(**inputs)
-        embeddings = outputs.last_hidden_state
-    sentence_embeddings = embeddings.mean(dim=1).numpy()
-    return sentence_embeddings

+import mimetypes
+import pandas as pd
+import PyPDF2
+import json
+import re
+import spacy
+import numpy as np
+from transformers import AutoTokenizer, AutoModel
+import torch
+import os
+# Load the spaCy English pipeline, downloading it on first use if it is missing
+try:
+    nlp = spacy.load("en_core_web_sm")
+except OSError:
+    # The OSError branch is taken when the model cannot be loaded; try to
+    # install it once and load again.
+    # Use the interpreter that is running this module (a bare "python" on PATH
+    # may belong to a different environment) and pass an argument list so no
+    # shell is involved.
+    import subprocess
+    import sys
+    # check=False: if the download fails, the retry below raises the same
+    # OSError the first attempt did, so callers see an unchanged exception type.
+    subprocess.run(
+        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
+        check=False,
+    )
+    nlp = spacy.load("en_core_web_sm")
+# Map a file path to one of the supported format tags
+def detect_file_type(file_path):
+    """Classify *file_path* as "pdf", "csv" or "json" from its guessed MIME type.
+
+    The MIME type is the first element returned by ``mimetypes.guess_type``.
+
+    Raises:
+        ValueError: if the guessed type is not one of the recognised values
+            (this includes the case where no type could be guessed).
+    """
+    mime_to_kind = {
+        "application/pdf": "pdf",
+        "text/csv": "csv",
+        "application/vnd.ms-excel": "csv",
+        "application/json": "json",
+    }
+    guessed_mime = mimetypes.guess_type(file_path)[0]
+    kind = mime_to_kind.get(guessed_mime)
+    if kind is None:
+        raise ValueError(f"Unsupported file format: {guessed_mime}")
+    return kind
+# Flatten a CSV file into a single string
+def extract_text_from_csv(file_path):
+    """Return every cell of the CSV at *file_path* as one space-separated string.
+
+    The file is read with ``pd.read_csv``, all values are converted to ``str``,
+    and the frame is stacked into a single series before joining.
+    """
+    cells = pd.read_csv(file_path).astype(str).stack()
+    return " ".join(cells)
+# Extract text from PDF
+def extract_text_from_pdf(file_path):
+    """Return the text of every page of the PDF at *file_path*.
+
+    Pages are read in document order with ``PyPDF2.PdfReader`` and joined with
+    a newline, so the last word of one page cannot fuse with the first word of
+    the next.
+
+    Returns:
+        The concatenated page texts; an empty string for a PDF with no pages.
+    """
+    pdf_reader = PyPDF2.PdfReader(file_path)
+    # `or ""` keeps the join safe if a page yields no text at all.
+    page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
+    return "\n".join(page_texts)
+# Extract text from JSON
+def extract_text_from_json(file_path):
+    """Return all leaf values of the JSON document at *file_path* as one string.
+
+    Dictionaries contribute their values (keys are ignored) and lists their
+    items, recursively; every other value is converted with ``str``. The
+    pieces are joined with single spaces in document order.
+    """
+    def flatten(node):
+        # Containers are walked depth-first; anything else is a leaf.
+        if isinstance(node, dict):
+            return " ".join(flatten(value) for value in node.values())
+        if isinstance(node, list):
+            return " ".join(flatten(item) for item in node)
+        return str(node)
+    # Decode as UTF-8 explicitly instead of relying on the platform's default
+    # encoding, which is not UTF-8 everywhere.
+    with open(file_path, "r", encoding="utf-8") as f:
+        data = json.load(f)
+    return flatten(data)
+# Pick the extractor that matches the file's detected format
+def extract_text(file_path):
+    """Detect the format of *file_path* and return its extracted text.
+
+    Delegates to ``extract_text_from_csv``, ``extract_text_from_pdf`` or
+    ``extract_text_from_json`` according to ``detect_file_type``.
+
+    Raises:
+        ValueError: if the detected format has no extractor.
+    """
+    extractors = {
+        "csv": extract_text_from_csv,
+        "pdf": extract_text_from_pdf,
+        "json": extract_text_from_json,
+    }
+    extractor = extractors.get(detect_file_type(file_path))
+    if extractor is None:
+        raise ValueError("Unsupported file format")
+    return extractor(file_path)
+# Preprocess text
+def preprocess_text_generalized(text, chunk_size=100000):
+    """Clean *text* and reduce it to space-joined lowercase lemmas.
+
+    Steps, in order:
+      1. delete URL-like tokens (runs starting with "http" or "www");
+      2. turn every whitespace run into one space, drop characters outside
+         printable ASCII (0x20-0x7E), then collapse repeated spaces;
+      3. run the module-level spaCy pipeline ``nlp`` over the lowercased text
+         in pieces of at most *chunk_size* characters, keeping the lemma of
+         each token that is alphabetic and not a stop word.
+
+    Args:
+        text: raw text to clean.
+        chunk_size: maximum number of characters passed to ``nlp`` at once.
+
+    Returns:
+        The kept lemmas joined by single spaces ("" for empty input).
+
+    Raises:
+        ValueError: if *chunk_size* is not positive.
+    """
+    if chunk_size <= 0:
+        raise ValueError("chunk_size must be a positive integer")
+    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
+    # Normalise whitespace BEFORE the ASCII filter: newlines and tabs are
+    # outside 0x20-0x7E, so filtering first would delete them and glue the
+    # neighbouring words together.
+    text = re.sub(r"\s+", " ", text)
+    text = re.sub(r"[^\x20-\x7E]", "", text)
+    # Removing characters can leave two spaces side by side.
+    text = re.sub(r" {2,}", " ", text)
+    lemmas = []
+    start, total = 0, len(text)
+    while start < total:
+        end = min(start + chunk_size, total)
+        if end < total:
+            # Back up to the last space inside the window so a word is not
+            # cut in two; searching from start + 1 guarantees progress.
+            cut = text.rfind(" ", start + 1, end)
+            if cut != -1:
+                end = cut
+        doc = nlp(text[start:end].lower())
+        lemmas.extend(
+            token.lemma_
+            for token in doc
+            if token.is_alpha and not token.is_stop
+        )
+        start = end
+    return " ".join(lemmas)
+# Generate embeddings
+# Tokenizer/model pairs already loaded in this process, keyed by model name.
+_HF_MODEL_CACHE = {}
+def _load_hf_model(model_name):
+    """Return the (tokenizer, model) pair for *model_name*, loading it only once."""
+    if model_name not in _HF_MODEL_CACHE:
+        _HF_MODEL_CACHE[model_name] = (
+            AutoTokenizer.from_pretrained(model_name),
+            AutoModel.from_pretrained(model_name),
+        )
+    return _HF_MODEL_CACHE[model_name]
+def get_embeddings_from_huggingface(cleaned_text, model_name="sentence-transformers/all-MiniLM-L6-v2"):
+    """Embed *cleaned_text* by mean-pooling the model's last hidden state.
+
+    Args:
+        cleaned_text: the text to embed, passed straight to the tokenizer.
+        model_name: Hugging Face model identifier to load.
+
+    Returns:
+        A NumPy array holding one pooled vector per input text.
+
+    Note:
+        The tokenizer is called with ``truncation=True, max_length=512``, so
+        anything beyond 512 tokens is ignored.
+    """
+    tokenizer, model = _load_hf_model(model_name)
+    inputs = tokenizer(cleaned_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
+    with torch.no_grad():
+        outputs = model(**inputs)
+        embeddings = outputs.last_hidden_state
+    # Weight each position by the attention mask so padding added for a batch
+    # of texts does not dilute the average. With a single unpadded text this
+    # equals a plain mean over the token axis.
+    mask = inputs["attention_mask"].unsqueeze(-1).to(embeddings.dtype)
+    summed = (embeddings * mask).sum(dim=1)
+    counts = mask.sum(dim=1).clamp(min=1e-9)
+    sentence_embeddings = (summed / counts).numpy()
+    return sentence_embeddings