Jagukumar committed on
Commit
ae65157
·
verified ·
1 Parent(s): a0ce01f

Update processing.py

Browse files
Files changed (1) hide show
  1. processing.py +99 -93
processing.py CHANGED
@@ -1,93 +1,99 @@
1
- import mimetypes
2
- import pandas as pd
3
- import PyPDF2
4
- import json
5
- import re
6
- import spacy
7
- import numpy as np
8
- from transformers import AutoTokenizer, AutoModel
9
- import torch
10
-
11
- # Load SpaCy model
12
- nlp = spacy.load("en_core_web_sm")
13
-
14
- # Detect file type
15
- def detect_file_type(file_path):
16
- file_type = mimetypes.guess_type(file_path)[0]
17
- if file_type in ["application/pdf"]:
18
- return "pdf"
19
- elif file_type in ["text/csv", "application/vnd.ms-excel"]:
20
- return "csv"
21
- elif file_type == "application/json":
22
- return "json"
23
- else:
24
- raise ValueError(f"Unsupported file format: {file_type}")
25
-
26
- # Extract text from CSV
27
- def extract_text_from_csv(file_path):
28
- df = pd.read_csv(file_path)
29
- text = " ".join(df.astype(str).stack())
30
- return text
31
-
32
- # Extract text from PDF
33
- def extract_text_from_pdf(file_path):
34
- pdf_reader = PyPDF2.PdfReader(file_path)
35
- text = ""
36
- for page in pdf_reader.pages:
37
- text += page.extract_text()
38
- return text
39
-
40
- # Extract text from JSON
41
- def extract_text_from_json(file_path):
42
- def recursive_text_extraction(data):
43
- if isinstance(data, dict):
44
- return " ".join(recursive_text_extraction(value) for value in data.values())
45
- elif isinstance(data, list):
46
- return " ".join(recursive_text_extraction(item) for item in data)
47
- else:
48
- return str(data)
49
- with open(file_path, 'r') as f:
50
- data = json.load(f)
51
- return recursive_text_extraction(data)
52
-
53
- # Generalized text extraction
54
- def extract_text(file_path):
55
- file_type = detect_file_type(file_path)
56
- if file_type == "csv":
57
- return extract_text_from_csv(file_path)
58
- elif file_type == "pdf":
59
- return extract_text_from_pdf(file_path)
60
- elif file_type == "json":
61
- return extract_text_from_json(file_path)
62
- else:
63
- raise ValueError("Unsupported file format")
64
-
65
- # Preprocess text
66
- def preprocess_text_generalized(text):
67
- text = re.sub(r"http\S+|www\S+|https\S+", "", text)
68
- text = re.sub(r"[^\x20-\x7E]", "", text)
69
- text = re.sub(r"\s+", " ", text)
70
- chunk_size = 100000
71
- chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
72
- processed_chunks = []
73
- for chunk in chunks:
74
- doc = nlp(chunk.lower())
75
- tokens = [
76
- token.lemma_
77
- for token in doc
78
- if not token.is_stop and token.is_alpha
79
- ]
80
- processed_chunks.append(" ".join(tokens))
81
- processed_text = " ".join(processed_chunks)
82
- return processed_text
83
-
84
- # Generate embeddings
85
- def get_embeddings_from_huggingface(cleaned_text, model_name="sentence-transformers/all-MiniLM-L6-v2"):
86
- tokenizer = AutoTokenizer.from_pretrained(model_name)
87
- model = AutoModel.from_pretrained(model_name)
88
- inputs = tokenizer(cleaned_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
89
- with torch.no_grad():
90
- outputs = model(**inputs)
91
- embeddings = outputs.last_hidden_state
92
- sentence_embeddings = embeddings.mean(dim=1).numpy()
93
- return sentence_embeddings
 
 
 
 
 
 
 
import json
import mimetypes
import os
import re
import subprocess
import sys
from functools import lru_cache

import numpy as np
import pandas as pd
import PyPDF2
import spacy
import torch
from transformers import AutoTokenizer, AutoModel
13
# Load the SpaCy English model, downloading it first if it is missing.
# spacy.load raises OSError when the model package is not installed.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Use the current interpreter (sys.executable) rather than whatever
    # "python" resolves to on PATH, and fail loudly if the download fails —
    # os.system's return code was previously ignored, so a failed download
    # surfaced only as a confusing second OSError from spacy.load.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")
19
+
20
# Detect file type
def detect_file_type(file_path):
    """Classify *file_path* as "pdf", "csv", or "json" from its guessed MIME type.

    Raises:
        ValueError: if the MIME type is not one of the supported ones
            (including when no type could be guessed at all).
    """
    mime_to_kind = {
        "application/pdf": "pdf",
        "text/csv": "csv",
        "application/vnd.ms-excel": "csv",
        "application/json": "json",
    }
    file_type, _ = mimetypes.guess_type(file_path)
    kind = mime_to_kind.get(file_type)
    if kind is None:
        raise ValueError(f"Unsupported file format: {file_type}")
    return kind
31
+
32
# Extract text from CSV
def extract_text_from_csv(file_path):
    """Read the CSV at *file_path* and return every cell value, stringified
    and joined with single spaces in row-major order."""
    frame = pd.read_csv(file_path)
    # astype(str) first, then stack() to flatten the frame row by row.
    return " ".join(frame.astype(str).stack())
37
+
38
# Extract text from PDF
def extract_text_from_pdf(file_path):
    """Concatenate the extracted text of every page of the PDF at *file_path*."""
    reader = PyPDF2.PdfReader(file_path)
    return "".join(page.extract_text() for page in reader.pages)
45
+
46
# Extract text from JSON
def extract_text_from_json(file_path):
    """Load the JSON file at *file_path* and return all leaf values,
    stringified and joined with single spaces in document order."""

    def _flatten(node):
        # Dicts contribute their values (insertion order preserved);
        # lists contribute their items; anything else is a leaf.
        if isinstance(node, dict):
            node = list(node.values())
        if isinstance(node, list):
            return " ".join(_flatten(child) for child in node)
        return str(node)

    with open(file_path, 'r') as fh:
        return _flatten(json.load(fh))
58
+
59
# Generalized text extraction
def extract_text(file_path):
    """Detect the file type of *file_path* and dispatch to the matching extractor.

    Raises:
        ValueError: if the detected type has no registered extractor
            (detect_file_type itself raises for unknown MIME types first).
    """
    extractors = {
        "csv": extract_text_from_csv,
        "pdf": extract_text_from_pdf,
        "json": extract_text_from_json,
    }
    extractor = extractors.get(detect_file_type(file_path))
    if extractor is None:
        raise ValueError("Unsupported file format")
    return extractor(file_path)
70
+
71
# Preprocess text
def preprocess_text_generalized(text, chunk_size=100000):
    """Clean and lemmatize *text* into a single lowercase token string.

    Strips URLs, removes everything outside printable ASCII, collapses
    whitespace, then runs SpaCy lemmatization in chunks, keeping only
    alphabetic non-stopword tokens.

    Args:
        text: raw input text.
        chunk_size: characters fed to SpaCy per call (default 100000, the
            previously hard-coded value); exposed so callers can tune it
            against their pipeline's ``nlp.max_length`` limit.

    Returns:
        The preprocessed text as one space-joined string of lemmas.
    """
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # drop URLs
    text = re.sub(r"[^\x20-\x7E]", "", text)             # printable ASCII only
    text = re.sub(r"\s+", " ", text)                     # collapse whitespace
    # Chunk so very long documents do not exceed SpaCy's input-length limit.
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    processed_chunks = []
    for chunk in chunks:
        doc = nlp(chunk.lower())
        tokens = [
            token.lemma_
            for token in doc
            if not token.is_stop and token.is_alpha
        ]
        processed_chunks.append(" ".join(tokens))
    return " ".join(processed_chunks)
89
+
90
# Cache loaded models: the original re-instantiated both the tokenizer and
# the model from disk on every call, which dominates the cost of embedding.
@lru_cache(maxsize=4)
def _load_embedding_components(model_name):
    """Return (tokenizer, model) for *model_name*, loading each only once."""
    return AutoTokenizer.from_pretrained(model_name), AutoModel.from_pretrained(model_name)


# Generate embeddings
def get_embeddings_from_huggingface(cleaned_text, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Embed *cleaned_text* with a HuggingFace encoder.

    Args:
        cleaned_text: the text to embed (tokenizer truncates at 512 tokens).
        model_name: HuggingFace model identifier.

    Returns:
        A numpy array of the mean-pooled last-hidden-state embeddings
        (one row per input sequence; a single string yields shape (1, hidden_size)).
    """
    tokenizer, model = _load_embedding_components(model_name)
    inputs = tokenizer(cleaned_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean-pool over the sequence dimension to get one vector per input.
    return outputs.last_hidden_state.mean(dim=1).numpy()