Jagukumar committed on
Commit
1b10704
·
verified ·
1 Parent(s): b919219

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +42 -0
  2. processing.py +93 -0
  3. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from processing import extract_text, preprocess_text_generalized, get_embeddings_from_huggingface
2
+ import gradio as gr
3
+ import numpy as np
4
+
5
def process_file(file_path):
    """Run the extract -> preprocess -> embed pipeline on an uploaded file.

    Args:
        file_path: Path of the uploaded file, or a falsy value when nothing
            was uploaded.

    Returns:
        A ``(message, download_path)`` tuple. On success ``message`` shows the
        first 10 embedding values and ``download_path`` points at a ``.npy``
        file holding the full embedding array. On failure ``message`` is the
        error text and ``download_path`` is ``None``.
    """
    # Local imports: the module's import block does not provide these.
    import os
    import tempfile

    if not file_path:
        return "No file uploaded.", None

    try:
        # Step 1: Extract text
        extracted_text = extract_text(file_path)

        # Step 2: Preprocess text
        cleaned_text = preprocess_text_generalized(extracted_text)

        # Step 3: Generate embeddings
        embeddings = get_embeddings_from_huggingface(cleaned_text)

        # Step 4: Save embeddings. Each request gets its own directory so
        # concurrent requests cannot overwrite each other's output, while the
        # downloaded file keeps the name "embeddings.npy".
        output_dir = tempfile.mkdtemp(prefix="embeddings_")
        output_path = os.path.join(output_dir, "embeddings.npy")
        np.save(output_path, embeddings)

        # Flatten before slicing: slicing rows of a one-row array would
        # return the entire vector rather than 10 values.
        preview = np.asarray(embeddings).ravel()[:10].tolist()
        return f"Top 10 Embeddings: {preview}", output_path
    except Exception as e:
        # UI boundary: show the error text to the user instead of crashing.
        return str(e), None
25
+
26
# Gradio UI: one file upload in; a text preview and a downloadable file out.
_upload_input = gr.File(label="Upload a file (CSV, PDF, JSON)", type="filepath")
_result_outputs = [
    gr.Textbox(label="Top 10 Embeddings"),
    gr.File(label="Download Full Embeddings"),
]
_page_description = (
    "Upload a file (CSV, PDF, or JSON) to generate embeddings using "
    "Hugging Face models. View the top 10 embeddings and download entire embedding file."
)

interface = gr.Interface(
    fn=process_file,
    inputs=_upload_input,
    outputs=_result_outputs,
    title="Embedding Converter Using Hugging Face Model",
    description=_page_description,
)

# Launch only when executed as a script, not when imported.
if __name__ == "__main__":
    interface.launch()
processing.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import mimetypes
2
+ import pandas as pd
3
+ import PyPDF2
4
+ import json
5
+ import re
6
+ import spacy
7
+ import numpy as np
8
+ from transformers import AutoTokenizer, AutoModel
9
+ import torch
10
+
11
# Load the spaCy English pipeline once at import time; the preprocessing
# function reuses this module-level object.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError when the model package is not installed.
    # The model is a separate package from spaCy itself, so fetch it once
    # and retry rather than failing at import.
    from spacy.cli import download as _download_spacy_model

    _download_spacy_model("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
13
+
14
def detect_file_type(file_path):
    """Classify a file as ``"pdf"``, ``"csv"`` or ``"json"``.

    The file extension is checked first (case-insensitively). If the
    extension is not one of the supported ones, the MIME type guessed by
    ``mimetypes`` is used as a fallback.

    Args:
        file_path: Path (or file name) of the file to classify.

    Returns:
        One of ``"pdf"``, ``"csv"``, ``"json"``.

    Raises:
        ValueError: If the file is not a supported format.
    """
    # Local import: the module's import block does not provide ``os``.
    import os

    by_extension = {".pdf": "pdf", ".csv": "csv", ".json": "json"}
    extension = os.path.splitext(file_path)[1].lower()
    if extension in by_extension:
        return by_extension[extension]

    # "application/vnd.ms-excel" is deliberately not accepted here: it is the
    # MIME type of real Excel workbooks (.xls), which the CSV reader cannot
    # parse. Files named *.csv are already handled by the extension check.
    by_mime = {
        "application/pdf": "pdf",
        "text/csv": "csv",
        "application/json": "json",
    }
    mime_type = mimetypes.guess_type(file_path)[0]
    if mime_type in by_mime:
        return by_mime[mime_type]

    raise ValueError(f"Unsupported file format: {mime_type}")
25
+
26
def extract_text_from_csv(file_path):
    """Return all cell values of a CSV file as one space-separated string.

    Cells are read row by row, left to right. The header row is not included.
    Values are read verbatim as strings, so empty cells are skipped instead
    of showing up as the literal word "nan", and integers in columns with
    blanks are not rewritten as floats (e.g. "1" stays "1", not "1.0").

    Args:
        file_path: Path of the CSV file.

    Returns:
        The concatenated cell text; an empty string if there are no cells.
    """
    df = pd.read_csv(file_path, dtype=str, keep_default_na=False)
    cells = df.to_numpy().ravel()  # row-major: row by row, left to right
    return " ".join(
        cell for cell in cells if isinstance(cell, str) and cell.strip()
    )
31
+
32
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF as one string.

    Page texts are joined with a single space so the last word of one page
    does not fuse with the first word of the next. A page for which the
    reader yields no text contributes an empty string.

    Args:
        file_path: Path of the PDF file.

    Returns:
        The concatenated page text.
    """
    pdf_reader = PyPDF2.PdfReader(file_path)
    # ``or ""`` guards against a page whose extraction yields None.
    page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    return " ".join(page_texts)
39
+
40
def extract_text_from_json(file_path):
    """Return all leaf values of a JSON document as one space-separated string.

    Dictionaries contribute their values (keys are ignored) and lists their
    items, recursively. Other values are converted with ``str``. JSON
    ``null`` values are skipped so they do not inject the word "None" into
    the text.

    Args:
        file_path: Path of the JSON file. It is read as UTF-8; a leading
            byte-order mark is tolerated.

    Returns:
        The concatenated leaf text; an empty string if there are no leaves.
    """

    def iter_leaf_text(node):
        # Depth-first walk yielding the string form of every non-null leaf.
        if isinstance(node, dict):
            for value in node.values():
                yield from iter_leaf_text(value)
        elif isinstance(node, list):
            for item in node:
                yield from iter_leaf_text(item)
        elif node is not None:
            yield str(node)

    # Explicit encoding: the platform default is not necessarily UTF-8.
    # "utf-8-sig" also accepts files that start with a BOM.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        data = json.load(f)
    return " ".join(iter_leaf_text(data))
52
+
53
def extract_text(file_path):
    """Extract text from a file using the extractor for its detected type.

    Args:
        file_path: Path of the file to read.

    Returns:
        The text produced by the CSV, PDF or JSON extractor.

    Raises:
        ValueError: If the detected type has no extractor.
    """
    kind = detect_file_type(file_path)
    extractors = {
        "csv": extract_text_from_csv,
        "pdf": extract_text_from_pdf,
        "json": extract_text_from_json,
    }
    handler = extractors.get(kind)
    if handler is None:
        raise ValueError("Unsupported file format")
    return handler(file_path)
64
+
65
def _split_at_spaces(text, limit):
    """Split text into pieces of at most ``limit`` characters, cutting at spaces."""
    pieces = []
    start = 0
    total = len(text)
    while start < total:
        end = min(start + limit, total)
        if end < total:
            # Prefer the last space inside the window so no word is cut in
            # half; fall back to a hard cut if the window has no usable space.
            cut = text.rfind(" ", start, end)
            if cut > start:
                end = cut
        pieces.append(text[start:end])
        start = end
    return pieces


def preprocess_text_generalized(text, chunk_size=100000):
    """Clean raw text and reduce it to lowercase lemmas of alphabetic non-stop words.

    Steps: remove URLs, turn every whitespace run into a single space, drop
    characters outside printable ASCII, then run the spaCy pipeline over the
    lowercased text in chunks and keep ``token.lemma_`` for tokens that are
    alphabetic and not stop words.

    Args:
        text: The raw input text.
        chunk_size: Maximum number of characters passed to the spaCy pipeline
            at once. Must be at least 1.

    Returns:
        The kept lemmas joined by single spaces.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    # Collapse whitespace BEFORE stripping non-printable characters: newlines
    # and tabs are outside \x20-\x7E, so stripping first would delete them
    # and fuse the words on either side.
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^\x20-\x7E]", "", text)
    # Removing characters can leave adjacent spaces behind; collapse again.
    text = re.sub(r"\s+", " ", text)

    chunks = _split_at_spaces(text, chunk_size)
    processed_chunks = []
    for doc in nlp.pipe(chunk.lower() for chunk in chunks):
        tokens = [
            token.lemma_
            for token in doc
            if not token.is_stop and token.is_alpha
        ]
        processed_chunks.append(" ".join(tokens))
    return " ".join(processed_chunks)
83
+
84
# (tokenizer, model) pairs keyed by model name, so the weights are loaded
# once per process instead of on every call.
_HF_MODEL_CACHE = {}


def _load_tokenizer_and_model(model_name):
    """Return the cached (tokenizer, model) pair for ``model_name``, loading it on first use."""
    if model_name not in _HF_MODEL_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name)
        _HF_MODEL_CACHE[model_name] = (tokenizer, model)
    return _HF_MODEL_CACHE[model_name]


def get_embeddings_from_huggingface(cleaned_text, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Embed text with a Hugging Face model using mean pooling over tokens.

    The input is tokenized with truncation at 512 tokens, so text beyond that
    limit does not contribute to the embedding.

    Args:
        cleaned_text: The text to embed.
        model_name: Hugging Face model identifier to load.

    Returns:
        A NumPy array with one row per input sequence, holding the mean of
        the model's last hidden state over that sequence's tokens.
    """
    tokenizer, model = _load_tokenizer_and_model(model_name)
    inputs = tokenizer(cleaned_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    token_embeddings = outputs.last_hidden_state

    # Average over real tokens only. For a single unpadded sequence the mask
    # is all ones and this equals a plain mean; for a padded batch it keeps
    # padding positions from diluting the average.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    sentence_embeddings = (summed / counts).numpy()
    return sentence_embeddings
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ transformers
2
+ gradio
3
+ pandas
4
+ PyPDF2
5
+ ipykernel
6
+ spacy
7
+ torch