# NOTE(review): the original header lines here were web-scrape residue
# (Hugging Face Space status, file size, commit hashes, and a row of
# line numbers) and were not valid Python; replaced with this comment
# so the module parses.
import mimetypes
import pandas as pd
import PyPDF2
import json
import re
import spacy
import os
from dotenv import load_dotenv
import openai
import numpy as np
# Load environment variables from a local .env file (e.g. OPENAI_API_KEY).
load_dotenv()

# Configure the OpenAI client with the key read from the environment;
# openai.api_key is None if the variable is unset.
openai.api_key = os.getenv("OPENAI_API_KEY")

# spacy itself is already imported at the top of the file; only the
# model downloader is needed here (the duplicate `import spacy` was removed).
from spacy.cli import download

# Load the SpaCy English pipeline, fetching the model on first run.
# spacy.load raises OSError when the model package is not installed.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Downloading SpaCy 'en_core_web_sm' model...")
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
# Detect file type
def detect_file_type(file_path):
file_type = mimetypes.guess_type(file_path)[0]
if file_type in ["application/pdf"]:
return "pdf"
elif file_type in ["text/csv", "application/vnd.ms-excel"]:
return "csv"
elif file_type == "application/json":
return "json"
else:
raise ValueError(f"Unsupported file format: {file_type}")
# Extract text from CSV
def extract_text_from_csv(file_path):
    """Read a CSV file and return every cell value joined by single spaces.

    Cells are stringified first; values are emitted row by row in column
    order (pandas stack order).
    """
    frame = pd.read_csv(file_path)
    cells = frame.astype(str).stack()
    return " ".join(cells)
# Extract text from PDF
def extract_text_from_pdf(file_path):
pdf_reader = PyPDF2.PdfReader(file_path)
text = ""
for page in pdf_reader.pages:
text += page.extract_text()
return text
# Extract text from JSON
def extract_text_from_json(file_path):
def recursive_text_extraction(data):
if isinstance(data, dict):
return " ".join(recursive_text_extraction(value) for value in data.values())
elif isinstance(data, list):
return " ".join(recursive_text_extraction(item) for item in data)
else:
return str(data)
with open(file_path, 'r') as f:
data = json.load(f)
return recursive_text_extraction(data)
# Generalized text extraction
def extract_text(file_path):
file_type = detect_file_type(file_path)
if file_type == "csv":
return extract_text_from_csv(file_path)
elif file_type == "pdf":
return extract_text_from_pdf(file_path)
elif file_type == "json":
return extract_text_from_json(file_path)
else:
raise ValueError("Unsupported file format")
# Preprocess text
def preprocess_text_generalized(text):
text = re.sub(r"http\S+|www\S+|https\S+", "", text) # Remove URLs
text = re.sub(r"[^\x20-\x7E]", "", text) # Remove non-ASCII characters
text = re.sub(r"\s+", " ", text) # Normalize whitespace
chunk_size = 100000 # Maximum chunk size
chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
processed_chunks = []
for chunk in chunks:
doc = nlp(chunk.lower())
tokens = [
token.lemma_
for token in doc
if not token.is_stop and token.is_alpha
]
processed_chunks.append(" ".join(tokens))
processed_text = " ".join(processed_chunks)
return processed_text
# Generate embeddings using OpenAI API
def get_openai_embeddings(text, model="text-embedding-ada-002"):
"""
Generate embeddings for a given text using OpenAI API.
"""
try:
response = openai.Embedding.create(input=text, model=model)
embeddings = response["data"][0]["embedding"]
return np.array(embeddings) # Convert to NumPy array for compatibility
except Exception as e:
print(f"Error generating embeddings: {e}")
return None
# Example usage
if __name__ == "__main__":
# Example file path
file_path = "example.pdf"
# Extract and preprocess text
raw_text = extract_text(file_path)
preprocessed_text = preprocess_text_generalized(raw_text)
# Generate embeddings using OpenAI API
embeddings = get_openai_embeddings(preprocessed_text)
if embeddings is not None:
print(f"Embeddings generated successfully. Shape: {embeddings.shape}")
else:
print("Failed to generate embeddings.")
# (trailing scrape artifact "|" removed)