Spaces:

Jagukumar
/

Text-To-Embeddings

Running

App Files Files Community

Jagukumar committed on Nov 25, 2024

Commit

1b10704

verified ·

1 Parent(s): b919219

Upload 3 files

Browse files

Files changed (3) hide show

app.py +42 -0
processing.py +93 -0
requirements.txt +7 -0

app.py ADDED Viewed

	@@ -0,0 +1,42 @@

+from processing import extract_text, preprocess_text_generalized, get_embeddings_from_huggingface
+import gradio as gr
+import numpy as np
def process_file(file_path):
    """Turn an uploaded file into embeddings plus a downloadable ``.npy`` file.

    Args:
        file_path: Path of the uploaded file, or ``None``/empty when nothing
            was uploaded.

    Returns:
        A ``(message, download_path)`` tuple. On success ``message`` is a
        short preview of the embedding values and ``download_path`` is the
        path of the saved array. On failure ``message`` is the error text and
        ``download_path`` is ``None``.
    """
    import tempfile

    if not file_path:
        return "No file uploaded. Please choose a CSV, PDF, or JSON file.", None

    try:
        # Step 1: Extract text
        extracted_text = extract_text(file_path)
        # Step 2: Preprocess text
        cleaned_text = preprocess_text_generalized(extracted_text)
        # Step 3: Generate embeddings
        embeddings = np.asarray(get_embeddings_from_huggingface(cleaned_text))

        # Step 4: Save to a unique temporary file. A fixed name in the working
        # directory would let concurrent requests overwrite each other's download.
        with tempfile.NamedTemporaryFile(
            prefix="embeddings_", suffix=".npy", delete=False
        ) as handle:
            np.save(handle, embeddings)
            temp_file_path = handle.name

        # A single pooled vector arrives as one row; slicing rows would dump the
        # whole vector, so preview the first 10 values of that row instead.
        if embeddings.ndim == 2 and embeddings.shape[0] == 1:
            preview_source = embeddings[0]
        else:
            preview_source = embeddings
        top_10_embeddings = preview_source[:10].tolist()
        return f"Top 10 Embeddings: {top_10_embeddings}", temp_file_path
    except Exception as e:
        # UI boundary: report the failure in the text box instead of crashing.
        return str(e), None
# Gradio UI: one file upload in, a text preview and a downloadable file out.
_upload_input = gr.File(label="Upload a file (CSV, PDF, JSON)", type="filepath")
_preview_output = gr.Textbox(label="Top 10 Embeddings")
_download_output = gr.File(label="Download Full Embeddings")
_description = (
    "Upload a file (CSV, PDF, or JSON) to  generate embeddings using "
    "Hugging Face models. View the top 10 embeddings and download  entire embedding file."
)

interface = gr.Interface(
    fn=process_file,
    inputs=_upload_input,
    outputs=[_preview_output, _download_output],
    title="Embedding Converter Using Hugging Face Model",
    description=_description,
)

# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    interface.launch()

processing.py ADDED Viewed

	@@ -0,0 +1,93 @@

import functools
import json
import mimetypes
import os
import re

import numpy as np
import pandas as pd
import PyPDF2
import spacy
import torch
from transformers import AutoTokenizer, AutoModel
# Shared SpaCy pipeline, loaded once when the module is imported.
_SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(_SPACY_MODEL_NAME)
# MIME types accepted for each supported kind of file.
_MIME_TO_KIND = {
    "application/pdf": "pdf",
    "text/csv": "csv",
    "application/vnd.ms-excel": "csv",
    "application/json": "json",
}

# Extension fallback for when the MIME lookup gives no usable answer.
_SUFFIX_TO_KIND = {
    ".pdf": "pdf",
    ".csv": "csv",
    ".json": "json",
}


def detect_file_type(file_path):
    """Classify a file as ``"pdf"``, ``"csv"`` or ``"json"`` from its name.

    The MIME type guessed by ``mimetypes`` is tried first. Its answer depends
    on the host's MIME registry, so the file extension is used as a fallback
    when the guessed type is missing or not one of the accepted ones.

    Args:
        file_path: Path (or file name) of the file to classify.

    Returns:
        One of ``"pdf"``, ``"csv"`` or ``"json"``.

    Raises:
        ValueError: If neither the MIME type nor the extension is supported.
    """
    path = os.fspath(file_path)
    mime_type = mimetypes.guess_type(path)[0]
    kind = _MIME_TO_KIND.get(mime_type)
    if kind is not None:
        return kind

    suffix = os.path.splitext(path)[1].lower()
    kind = _SUFFIX_TO_KIND.get(suffix)
    if kind is not None:
        return kind

    # Report whatever identifying detail is available rather than a bare "None".
    raise ValueError(f"Unsupported file format: {mime_type or suffix or 'unknown'}")
def extract_text_from_csv(file_path):
    """Return every cell value of a CSV file as one space-separated string.

    Cells are read row by row, left to right. Column headers are not part of
    the output. Missing cells are skipped; converting them to text would
    inject the literal word "nan" into the result.

    Args:
        file_path: Path of the CSV file.

    Returns:
        The non-missing cell values joined by single spaces.
    """
    df = pd.read_csv(file_path)
    # Row-major flattening keeps the "row by row" reading order.
    cells = df.to_numpy(dtype=object).ravel()
    return " ".join(str(cell) for cell in cells if not pd.isna(cell))
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, one page per line group.

    Args:
        file_path: Path of the PDF file.

    Returns:
        The extracted page texts joined by newlines. Pages that yield no
        text contribute an empty string.
    """
    pdf_reader = PyPDF2.PdfReader(file_path)
    # "or ''" guards against a page whose extraction yields nothing usable.
    page_texts = (page.extract_text() or "" for page in pdf_reader.pages)
    # Join with a separator so the last word of one page is not fused with
    # the first word of the next.
    return "\n".join(page_texts)
def extract_text_from_json(file_path):
    """Return all leaf values of a JSON document as one space-separated string.

    Dictionaries and lists are walked recursively. Dictionary keys are not
    part of the output. ``null`` values are skipped so the word "None" does
    not leak into the text.

    Args:
        file_path: Path of the JSON file (UTF-8, with or without a BOM).

    Returns:
        The leaf values, converted with ``str`` and joined by single spaces.
    """
    def iter_leaf_text(node):
        # Yield the text of every non-null leaf below ``node``.
        if isinstance(node, dict):
            for value in node.values():
                yield from iter_leaf_text(value)
        elif isinstance(node, list):
            for item in node:
                yield from iter_leaf_text(item)
        elif node is not None:
            yield str(node)

    # "utf-8-sig" reads plain UTF-8 and also tolerates a leading BOM, instead
    # of depending on the platform's default encoding.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        data = json.load(f)
    return " ".join(iter_leaf_text(data))
def extract_text(file_path):
    """Extract the text of a CSV, PDF or JSON file.

    The file kind is determined with ``detect_file_type`` and the matching
    extractor is then called on the same path.

    Args:
        file_path: Path of the file to read.

    Returns:
        The text produced by the extractor for the detected kind.

    Raises:
        ValueError: If the detected kind has no extractor.
    """
    detected_kind = detect_file_type(file_path)
    extractors = {
        "csv": extract_text_from_csv,
        "pdf": extract_text_from_pdf,
        "json": extract_text_from_json,
    }
    extractor = extractors.get(detected_kind)
    if extractor is None:
        raise ValueError("Unsupported file format")
    return extractor(file_path)
def _split_at_spaces(text, chunk_size):
    """Yield consecutive pieces of ``text`` of at most ``chunk_size`` characters, cut at spaces when possible."""
    start = 0
    total = len(text)
    while start < total:
        end = min(start + chunk_size, total)
        if end < total:
            # Move the cut back to the last space so no word is split in two.
            cut = text.rfind(" ", start, end)
            if cut > start:
                end = cut
        yield text[start:end]
        start = end


def preprocess_text_generalized(text, chunk_size=100000):
    """Clean raw text and reduce it to lowercase lemmas without stop words.

    URLs are removed, whitespace is normalized, characters outside printable
    ASCII are dropped, and the text is lowercased. It is then run through the
    SpaCy pipeline in chunks, keeping the lemma of every alphabetic token
    that is not a stop word.

    Args:
        text: Raw input text.
        chunk_size: Maximum number of characters handed to SpaCy at once.

    Returns:
        The kept lemmas joined by single spaces (empty string for empty input).

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    text = re.sub(r"http\S+|www\S+", "", text)
    # Collapse whitespace BEFORE dropping non-printable characters: newlines
    # and tabs lie outside \x20-\x7E, so stripping first would glue together
    # words that were separated only by a line break.
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^\x20-\x7E]", "", text)
    # Removing characters can leave adjacent spaces behind.
    text = re.sub(r" {2,}", " ", text)
    text = text.lower()

    processed_chunks = []
    for chunk in _split_at_spaces(text, chunk_size):
        doc = nlp(chunk)
        tokens = [
            token.lemma_
            for token in doc
            if not token.is_stop and token.is_alpha
        ]
        processed_chunks.append(" ".join(tokens))
    return " ".join(processed_chunks)
@functools.lru_cache(maxsize=4)
def _load_tokenizer_and_model(model_name):
    """Load and cache the tokenizer/model pair so repeated calls do not reload the weights."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    return tokenizer, model


def get_embeddings_from_huggingface(cleaned_text, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Embed text with a Hugging Face model using mean pooling.

    The input is tokenized with truncation at 512 tokens, so text beyond
    that limit does not influence the result.

    Args:
        cleaned_text: Text to embed, passed straight to the tokenizer.
        model_name: Hugging Face model identifier to load.

    Returns:
        A NumPy array with one pooled embedding per input sequence
        (a single row for a single string).
    """
    tokenizer, model = _load_tokenizer_and_model(model_name)
    inputs = tokenizer(cleaned_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
        token_embeddings = outputs.last_hidden_state

    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        pooled = token_embeddings.mean(dim=1)
    else:
        # Average over real tokens only; padding positions must not dilute
        # the mean when several sequences of different length are batched.
        mask = attention_mask.unsqueeze(-1).to(token_embeddings.dtype)
        pooled = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
    return pooled.numpy()

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+transformers
+gradio
+pandas
+PyPDF2
+ipykernel
+spacy
+torch