Jagukumar committed on
Commit
11e725d
·
verified ·
1 Parent(s): 74a5040

Update processing.py

Browse files
Files changed (1) hide show
  1. processing.py +132 -119
processing.py CHANGED
@@ -1,119 +1,132 @@
1
- import mimetypes
2
- import pandas as pd
3
- import PyPDF2
4
- import json
5
- import re
6
- import spacy
7
- import os
8
- from dotenv import load_dotenv
9
- import openai
10
- import numpy as np
11
-
12
- # Load environment variables
13
- load_dotenv()
14
-
15
- # Set OpenAI API Key
16
- openai.api_key = os.getenv("OPENAI_API_KEY")
17
-
18
- # Load SpaCy model
19
- nlp = spacy.load("en_core_web_sm")
20
-
21
- # Detect file type
22
- def detect_file_type(file_path):
23
- file_type = mimetypes.guess_type(file_path)[0]
24
- if file_type in ["application/pdf"]:
25
- return "pdf"
26
- elif file_type in ["text/csv", "application/vnd.ms-excel"]:
27
- return "csv"
28
- elif file_type == "application/json":
29
- return "json"
30
- else:
31
- raise ValueError(f"Unsupported file format: {file_type}")
32
-
33
- # Extract text from CSV
34
- def extract_text_from_csv(file_path):
35
- df = pd.read_csv(file_path)
36
- text = " ".join(df.astype(str).stack())
37
- return text
38
-
39
- # Extract text from PDF
40
- def extract_text_from_pdf(file_path):
41
- pdf_reader = PyPDF2.PdfReader(file_path)
42
- text = ""
43
- for page in pdf_reader.pages:
44
- text += page.extract_text()
45
- return text
46
-
47
- # Extract text from JSON
48
- def extract_text_from_json(file_path):
49
- def recursive_text_extraction(data):
50
- if isinstance(data, dict):
51
- return " ".join(recursive_text_extraction(value) for value in data.values())
52
- elif isinstance(data, list):
53
- return " ".join(recursive_text_extraction(item) for item in data)
54
- else:
55
- return str(data)
56
-
57
- with open(file_path, 'r') as f:
58
- data = json.load(f)
59
- return recursive_text_extraction(data)
60
-
61
- # Generalized text extraction
62
- def extract_text(file_path):
63
- file_type = detect_file_type(file_path)
64
- if file_type == "csv":
65
- return extract_text_from_csv(file_path)
66
- elif file_type == "pdf":
67
- return extract_text_from_pdf(file_path)
68
- elif file_type == "json":
69
- return extract_text_from_json(file_path)
70
- else:
71
- raise ValueError("Unsupported file format")
72
-
73
- # Preprocess text
74
- def preprocess_text_generalized(text):
75
- text = re.sub(r"http\S+|www\S+|https\S+", "", text) # Remove URLs
76
- text = re.sub(r"[^\x20-\x7E]", "", text) # Remove non-ASCII characters
77
- text = re.sub(r"\s+", " ", text) # Normalize whitespace
78
- chunk_size = 100000 # Maximum chunk size
79
- chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
80
- processed_chunks = []
81
- for chunk in chunks:
82
- doc = nlp(chunk.lower())
83
- tokens = [
84
- token.lemma_
85
- for token in doc
86
- if not token.is_stop and token.is_alpha
87
- ]
88
- processed_chunks.append(" ".join(tokens))
89
- processed_text = " ".join(processed_chunks)
90
- return processed_text
91
-
92
- # Generate embeddings using OpenAI API
93
- def get_openai_embeddings(text, model="text-embedding-ada-002"):
94
- """
95
- Generate embeddings for a given text using OpenAI API.
96
- """
97
- try:
98
- response = openai.Embedding.create(input=text, model=model)
99
- embeddings = response["data"][0]["embedding"]
100
- return np.array(embeddings) # Convert to NumPy array for compatibility
101
- except Exception as e:
102
- print(f"Error generating embeddings: {e}")
103
- return None
104
-
105
- # Example usage
106
- if __name__ == "__main__":
107
- # Example file path
108
- file_path = "example.pdf"
109
-
110
- # Extract and preprocess text
111
- raw_text = extract_text(file_path)
112
- preprocessed_text = preprocess_text_generalized(raw_text)
113
-
114
- # Generate embeddings using OpenAI API
115
- embeddings = get_openai_embeddings(preprocessed_text)
116
- if embeddings is not None:
117
- print(f"Embeddings generated successfully. Shape: {embeddings.shape}")
118
- else:
119
- print("Failed to generate embeddings.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import mimetypes
2
+ import pandas as pd
3
+ import PyPDF2
4
+ import json
5
+ import re
6
+ import spacy
7
+ import os
8
+ from dotenv import load_dotenv
9
+ import openai
10
+ import numpy as np
11
+
12
+ # Load environment variables
13
+ load_dotenv()
14
+
15
+ # Set OpenAI API Key
16
+ openai.api_key = os.getenv("OPENAI_API_KEY")
17
+
18
+ # Load SpaCy model
19
+ # nlp = spacy.load("en_core_web_sm")
20
+
21
+
22
+ import spacy
23
+ from spacy.cli import download
24
+
25
+ # Ensure the model is available
26
def _load_english_model():
    """Return the "en_core_web_sm" SpaCy pipeline, downloading it if needed.

    The first ``spacy.load`` attempt is retried once after a download when it
    fails with ``OSError`` (presumably SpaCy's signal for a model package
    that is not installed -- confirm against the SpaCy version in use).
    """
    try:
        return spacy.load("en_core_web_sm")
    except OSError:
        print("Downloading SpaCy 'en_core_web_sm' model...")
        download("en_core_web_sm")
        return spacy.load("en_core_web_sm")


# Shared pipeline used by the preprocessing step below.
nlp = _load_english_model()
32
+
33
+
34
+ # Detect file type
35
def detect_file_type(file_path):
    """Classify a file as "pdf", "csv" or "json" from its name.

    The decision is based on the MIME type that ``mimetypes.guess_type``
    derives from the file name; the file itself is never opened.

    Args:
        file_path: Path or file name whose extension identifies the format.

    Returns:
        One of the strings "pdf", "csv" or "json".

    Raises:
        ValueError: If the guessed MIME type is not a supported format
            (this includes the case where no type could be guessed).
    """
    file_type = mimetypes.guess_type(file_path)[0]
    if file_type == "application/pdf":
        return "pdf"
    if file_type == "text/csv":
        return "csv"
    # "application/vnd.ms-excel" is also the MIME type of real Excel
    # workbooks such as ".xls", which the CSV reader cannot parse. It is
    # presumably listed because some platform MIME tables report it for
    # ".csv" files, so accept it only when the name really ends in ".csv".
    if (
        file_type == "application/vnd.ms-excel"
        and os.fspath(file_path).lower().endswith(".csv")
    ):
        return "csv"
    if file_type == "application/json":
        return "json"
    raise ValueError(f"Unsupported file format: {file_type}")
45
+
46
+ # Extract text from CSV
47
def extract_text_from_csv(file_path):
    """Return every cell of a CSV file as one space-separated string.

    The file is read with ``pd.read_csv``, each cell is converted to ``str``
    and the cells are joined row by row. The header row is consumed by the
    reader as column names and therefore does not appear in the output.

    Args:
        file_path: Location of the CSV file to read.

    Returns:
        A single string containing all cell values separated by spaces.
    """
    frame = pd.read_csv(file_path)
    cells = frame.astype(str).stack()
    return " ".join(cells)
51
+
52
+ # Extract text from PDF
53
def extract_text_from_pdf(file_path):
    """Return the extracted text of every page of a PDF, in page order.

    Args:
        file_path: Location of the PDF, passed straight to
            ``PyPDF2.PdfReader``.

    Returns:
        The per-page results of ``extract_text()`` concatenated with no
        separator between pages.
    """
    reader = PyPDF2.PdfReader(file_path)
    return "".join(page.extract_text() for page in reader.pages)
59
+
60
+ # Extract text from JSON
61
def extract_text_from_json(file_path):
    """Return all leaf values of a JSON document as one string.

    Dictionaries contribute their values (keys are ignored) and lists
    contribute their items; every other value is converted with ``str``.
    The pieces are joined with single spaces in document order.

    Args:
        file_path: Location of the JSON file to read.

    Returns:
        A space-separated string of every leaf value in the document.
    """

    def flatten(node):
        # Containers are flattened recursively; anything else is a leaf.
        if isinstance(node, dict):
            return " ".join(flatten(value) for value in node.values())
        if isinstance(node, list):
            return " ".join(flatten(item) for item in node)
        return str(node)

    # Decode explicitly as UTF-8 rather than relying on the platform's
    # locale encoding, which mis-decodes non-ASCII JSON on some systems.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return flatten(data)
73
+
74
+ # Generalized text extraction
75
def extract_text(file_path):
    """Extract raw text from a CSV, PDF or JSON file.

    The format is determined by ``detect_file_type`` and the matching
    ``extract_text_from_*`` function does the actual reading.

    Args:
        file_path: Location of the file to read.

    Returns:
        The text produced by the format-specific extractor.

    Raises:
        ValueError: If the detected format has no extractor.
    """
    extractors = {
        "csv": extract_text_from_csv,
        "pdf": extract_text_from_pdf,
        "json": extract_text_from_json,
    }
    extractor = extractors.get(detect_file_type(file_path))
    if extractor is None:
        raise ValueError("Unsupported file format")
    return extractor(file_path)
85
+
86
+ # Preprocess text
87
def _split_at_whitespace(text, limit):
    """Yield consecutive pieces of *text*, each at most *limit* characters.

    A piece ends at the last space inside its window when there is one, so
    words are not cut in half; a window without any space is cut at *limit*.
    """
    start = 0
    total = len(text)
    while start < total:
        end = min(start + limit, total)
        if end < total:
            cut = text.rfind(" ", start, end)
            if cut > start:
                end = cut
        yield text[start:end]
        start = end


def preprocess_text_generalized(text, chunk_size=100000):
    """Clean raw text and reduce it to lower-cased, lemmatized content words.

    Steps: strip URLs, collapse all whitespace to single spaces, drop every
    character outside printable ASCII, then run the module-level SpaCy
    pipeline ``nlp`` over the lower-cased text in chunks, keeping the lemma
    of each alphabetic token that is not a stop word.

    Args:
        text: Raw input text.
        chunk_size: Maximum number of characters passed to ``nlp`` at once.

    Returns:
        The kept lemmas joined by single spaces.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # Remove URLs
    # Normalize whitespace BEFORE stripping non-printable characters: "\n"
    # and "\t" lie outside \x20-\x7E, so stripping first would delete them
    # and fuse words that were separated only by a line break or tab.
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^\x20-\x7E]", "", text)  # Keep printable ASCII only

    processed_chunks = []
    for chunk in _split_at_whitespace(text, chunk_size):
        doc = nlp(chunk.lower())
        lemmas = [
            token.lemma_
            for token in doc
            if token.is_alpha and not token.is_stop
        ]
        processed_chunks.append(" ".join(lemmas))
    return " ".join(processed_chunks)
104
+
105
+ # Generate embeddings using OpenAI API
106
def get_openai_embeddings(text, model="text-embedding-ada-002"):
    """Generate an embedding vector for *text* with the OpenAI API.

    Works with both generations of the ``openai`` package: the 1.x interface
    (``openai.embeddings.create``) is used when the package exposes the
    ``OpenAI`` client class, otherwise the legacy ``openai.Embedding.create``
    call is used. The legacy call is no longer available in openai>=1.0, so
    relying on it alone makes every request fail on current installs.

    Args:
        text: Input text to embed.
        model: Name of the embedding model to request.

    Returns:
        A NumPy array holding the first embedding in the response, or
        ``None`` if the request or response handling failed for any reason
        (the error is printed).
    """
    try:
        if hasattr(openai, "OpenAI"):
            # openai>=1.0: module-level client, attribute-style response.
            response = openai.embeddings.create(input=text, model=model)
            embeddings = response.data[0].embedding
        else:
            # openai<1.0: legacy resource class, dict-style response.
            response = openai.Embedding.create(input=text, model=model)
            embeddings = response["data"][0]["embedding"]
        return np.array(embeddings)  # Convert to NumPy array for compatibility
    except Exception as e:
        # Deliberate best-effort: callers check for None instead of catching.
        print(f"Error generating embeddings: {e}")
        return None
117
+
118
+ # Example usage
119
if __name__ == "__main__":
    # Demo run: extract text from a sample file, clean it, embed it and
    # report whether an embedding came back.
    sample_path = "example.pdf"

    cleaned_text = preprocess_text_generalized(extract_text(sample_path))

    embeddings = get_openai_embeddings(cleaned_text)
    if embeddings is None:
        print("Failed to generate embeddings.")
    else:
        print(f"Embeddings generated successfully. Shape: {embeddings.shape}")