File size: 3,970 Bytes
11e725d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d286924
11e725d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import mimetypes
import pandas as pd
import PyPDF2
import json
import re
import spacy
import os
from dotenv import load_dotenv
import openai
import numpy as np

# Load environment variables from a local .env file (if present).
load_dotenv()

# Set OpenAI API key. If the variable is missing this is None and API
# calls will fail at request time, not here.
openai.api_key = os.getenv("OPENAI_API_KEY")

# SpaCy model bootstrap: load the small English pipeline, downloading it
# on first run. (The duplicate `import spacy` that used to sit here was
# redundant — spacy is already imported at the top of the file.)
from spacy.cli import download

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # OSError is what spacy.load raises when the model package is absent.
    print("Downloading SpaCy 'en_core_web_sm' model...")
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")


# Detect file type
def detect_file_type(file_path):
    """Classify *file_path* as ``"pdf"``, ``"csv"`` or ``"json"`` by MIME type.

    The type is guessed from the file extension via :mod:`mimetypes`; the
    file itself is never opened.

    Raises:
        ValueError: for unknown extensions (guess is ``None``) or any other
            MIME type.
    """
    file_type = mimetypes.guess_type(file_path)[0]
    if file_type == "application/pdf":
        return "pdf"
    # "application/csv" added: some platforms (e.g. Windows registry) map
    # .csv to it instead of "text/csv", which used to hit the error branch.
    elif file_type in ("text/csv", "application/csv", "application/vnd.ms-excel"):
        return "csv"
    elif file_type == "application/json":
        return "json"
    else:
        raise ValueError(f"Unsupported file format: {file_type}")

# Extract text from CSV
def extract_text_from_csv(file_path):
    """Read a CSV file and return every cell, stringified, joined by spaces.

    Cells are emitted row-major (left to right, top to bottom).
    """
    frame = pd.read_csv(file_path)
    # Stringify first, then flatten row-major — same ordering and NaN
    # handling ("nan" strings) as stacking the string-typed frame.
    cells = frame.astype(str).to_numpy().ravel()
    return " ".join(cells)

# Extract text from PDF
def extract_text_from_pdf(file_path):
    """Concatenate the extracted text of every page in a PDF.

    Pages with no extractable text (e.g. scanned/image-only pages, where
    ``extract_text()`` may yield None or "" — TODO confirm for the installed
    PyPDF2 version) contribute nothing instead of raising.
    """
    pdf_reader = PyPDF2.PdfReader(file_path)
    # join() avoids quadratic `+=` string building over many pages;
    # `or ""` guards against a None return from extract_text().
    return "".join((page.extract_text() or "") for page in pdf_reader.pages)

# Extract text from JSON
def extract_text_from_json(file_path):
    """Flatten every scalar value in a JSON file into one space-joined string.

    Dicts are walked over their *values* only (keys are ignored) and lists
    element-wise, both recursively; scalars are stringified with ``str()``.
    """
    def recursive_text_extraction(data):
        if isinstance(data, dict):
            return " ".join(recursive_text_extraction(value) for value in data.values())
        elif isinstance(data, list):
            return " ".join(recursive_text_extraction(item) for item in data)
        else:
            return str(data)

    # Explicit encoding: without it the decode depends on the platform
    # default and can break on UTF-8 input (fixed defect).
    with open(file_path, 'r', encoding="utf-8") as f:
        data = json.load(f)
    return recursive_text_extraction(data)

# Generalized text extraction
def extract_text(file_path):
    """Extract raw text from a file, dispatching on its detected type.

    Supports CSV, PDF and JSON; raises ValueError for anything else.
    """
    extractors = {
        "csv": extract_text_from_csv,
        "pdf": extract_text_from_pdf,
        "json": extract_text_from_json,
    }
    kind = detect_file_type(file_path)
    extractor = extractors.get(kind)
    if extractor is None:
        raise ValueError("Unsupported file format")
    return extractor(file_path)

# Preprocess text
def preprocess_text_generalized(text):
    """Clean *text* and reduce it to space-joined, lowercased lemmas.

    Steps: strip URLs and non-ASCII characters, collapse whitespace, then
    run SpaCy (module-level ``nlp``) over bounded chunks and keep the lemma
    of every alphabetic, non-stopword token.
    """
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # Remove URLs
    text = re.sub(r"[^\x20-\x7E]", "", text)  # Remove non-ASCII characters
    text = re.sub(r"\s+", " ", text)  # Normalize whitespace

    # Chunk to stay under SpaCy's input-length limit. Break at the last
    # space inside each window so a word is never split across two chunks
    # (a split word would tokenize/lemmatize wrongly — fixed defect of the
    # previous fixed-width slicing).
    chunk_size = 100000  # Maximum chunk size
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        if end < len(text):
            space = text.rfind(" ", start, end)
            if space > start:  # keep progress guaranteed (end > start)
                end = space
        chunks.append(text[start:end])
        start = end

    processed_chunks = []
    for chunk in chunks:
        doc = nlp(chunk.lower())
        tokens = [
            token.lemma_
            for token in doc
            if not token.is_stop and token.is_alpha
        ]
        processed_chunks.append(" ".join(tokens))
    return " ".join(processed_chunks)

# Generate embeddings using OpenAI API
def get_openai_embeddings(text, model="text-embedding-ada-002"):
    """
    Generate embeddings for a given text using OpenAI API.

    Returns a NumPy vector on success, or None if the request (or the
    parsing of its response) fails — the error is printed, not raised.
    """
    try:
        response = openai.Embedding.create(input=text, model=model)
        vector = response["data"][0]["embedding"]
        return np.array(vector)  # Convert to NumPy array for compatibility
    except Exception as exc:
        print(f"Error generating embeddings: {exc}")
        return None

# Example usage
if __name__ == "__main__":
    # Demo input file; swap in a real path to run the pipeline end-to-end.
    file_path = "example.pdf"

    # Pipeline: raw text -> cleaned lemmas -> embedding vector.
    raw_text = extract_text(file_path)
    preprocessed_text = preprocess_text_generalized(raw_text)

    embeddings = get_openai_embeddings(preprocessed_text)
    if embeddings is None:
        print("Failed to generate embeddings.")
    else:
        print(f"Embeddings generated successfully. Shape: {embeddings.shape}")