File size: 1,704 Bytes
e03340e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import os
import subprocess
import sys
import tempfile

import gradio as gr
import numpy as np
import spacy

from processing import extract_text, preprocess_text_generalized, get_embeddings_from_huggingface

# Load the spaCy English model, downloading it on first use if it is missing.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # The load failed with OSError, which this script treats as "model not
    # installed". Run the download with the interpreter that is executing
    # this script (sys.executable) instead of whatever "python" resolves to
    # on PATH, so the model is installed into the active environment.
    # Passing an argument list also avoids going through a shell.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=False,
    )
    # If the download did not succeed, this second load is expected to raise
    # again and stop the script with spaCy's own error.
    nlp = spacy.load("en_core_web_sm")


def process_file(file_path):
    """Turn an uploaded file into embeddings and expose them for download.

    The file is passed through ``extract_text``, ``preprocess_text_generalized``
    and ``get_embeddings_from_huggingface``, in that order. The result is
    saved with ``np.save`` as ``embeddings.npy`` inside a freshly created
    temporary directory, so separate calls never overwrite each other's
    output file.

    Args:
        file_path: Path of the uploaded file, or a falsy value (``None`` or
            an empty string) when nothing was uploaded.

    Returns:
        A ``(message, path)`` tuple. On success ``message`` shows the first
        ten entries of the embeddings and ``path`` is the saved ``.npy``
        file. On failure ``message`` describes the problem and ``path`` is
        ``None``.
    """
    if not file_path:
        return "No file uploaded.", None

    try:
        # Step 1: Extract text
        extracted_text = extract_text(file_path)

        # Step 2: Preprocess text
        cleaned_text = preprocess_text_generalized(extracted_text)

        # Step 3: Generate embeddings
        # NOTE(review): the helper's return type is not visible from this
        # file; np.asarray lets the slicing and .tolist() below work for a
        # plain list as well as for an ndarray.
        embeddings = np.asarray(get_embeddings_from_huggingface(cleaned_text))

        # Step 4: Save embeddings to a per-call temporary directory. A fixed
        # path in the working directory would be shared by every request.
        # The directory is not removed here because the caller still needs
        # to read the file after this function returns.
        output_dir = tempfile.mkdtemp(prefix="embeddings_")
        output_path = os.path.join(output_dir, "embeddings.npy")
        np.save(output_path, embeddings)

        # Return the first 10 entries as a preview plus the path for download
        top_10_embeddings = embeddings[:10].tolist()
        return f"Top 10 Embeddings: {top_10_embeddings}", output_path
    except Exception as exc:
        # UI boundary: report the failure in the textbox instead of raising.
        # Fall back to repr() so exceptions without a message are not blank.
        return str(exc) or repr(exc), None

# Build the Gradio UI: a single file upload goes in; a text preview and a
# downloadable file come out. process_file does the actual work.
interface = gr.Interface(
    title="Embedding Converter Using Hugging Face Model",
    description=(
        "Upload a file (CSV, PDF, or JSON) to  generate embeddings using Hugging Face models. "
        "View the top 10 embeddings and download  entire embedding file."
    ),
    fn=process_file,
    # type="filepath" hands process_file a path string rather than a file object.
    inputs=gr.File(type="filepath", label="Upload a file (CSV, PDF, JSON)"),
    # Order matches the (message, path) tuple returned by process_file.
    outputs=[
        gr.Textbox(label="Top 10 Embeddings"),
        gr.File(label="Download Full Embeddings"),
    ],
)

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    interface.launch()