Spaces:

Jagukumar
/

Dynamic-chatbot-using-RAG

Build error

App Files Files Community

Jagukumar committed on Nov 27, 2024

Commit

11e725d

verified ·

1 Parent(s): 74a5040

Update processing.py

Browse files

Files changed (1) hide show

processing.py +132 -119

processing.py CHANGED Viewed

@@ -1,119 +1,132 @@
-import mimetypes
-import pandas as pd
-import PyPDF2
-import json
-import re
-import spacy
-import os
-from dotenv import load_dotenv
-import openai
-import numpy as np
-# Load environment variables
-load_dotenv()
-# Set OpenAI API Key
-openai.api_key = os.getenv("OPENAI_API_KEY")
-# Load SpaCy model
-nlp = spacy.load("en_core_web_sm")
-# Detect file type
-def detect_file_type(file_path):
-    file_type = mimetypes.guess_type(file_path)[0]
-    if file_type in ["application/pdf"]:
-        return "pdf"
-    elif file_type in ["text/csv", "application/vnd.ms-excel"]:
-        return "csv"
-    elif file_type == "application/json":
-        return "json"
-    else:
-        raise ValueError(f"Unsupported file format: {file_type}")
-# Extract text from CSV
-def extract_text_from_csv(file_path):
-    df = pd.read_csv(file_path)
-    text = " ".join(df.astype(str).stack())
-    return text
-# Extract text from PDF
-def extract_text_from_pdf(file_path):
-    pdf_reader = PyPDF2.PdfReader(file_path)
-    text = ""
-    for page in pdf_reader.pages:
-        text += page.extract_text()
-    return text
-# Extract text from JSON
-def extract_text_from_json(file_path):
-    def recursive_text_extraction(data):
-        if isinstance(data, dict):
-            return " ".join(recursive_text_extraction(value) for value in data.values())
-        elif isinstance(data, list):
-            return " ".join(recursive_text_extraction(item) for item in data)
-        else:
-            return str(data)
-    with open(file_path, 'r') as f:
-        data = json.load(f)
-    return recursive_text_extraction(data)
-# Generalized text extraction
-def extract_text(file_path):
-    file_type = detect_file_type(file_path)
-    if file_type == "csv":
-        return extract_text_from_csv(file_path)
-    elif file_type == "pdf":
-        return extract_text_from_pdf(file_path)
-    elif file_type == "json":
-        return extract_text_from_json(file_path)
-    else:
-        raise ValueError("Unsupported file format")
-# Preprocess text
-def preprocess_text_generalized(text):
-    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # Remove URLs
-    text = re.sub(r"[^\x20-\x7E]", "", text)  # Remove non-ASCII characters
-    text = re.sub(r"\s+", " ", text)  # Normalize whitespace
-    chunk_size = 100000  # Maximum chunk size
-    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
-    processed_chunks = []
-    for chunk in chunks:
-        doc = nlp(chunk.lower())
-        tokens = [
-            token.lemma_
-            for token in doc
-            if not token.is_stop and token.is_alpha
-        ]
-        processed_chunks.append(" ".join(tokens))
-    processed_text = " ".join(processed_chunks)
-    return processed_text
-# Generate embeddings using OpenAI API
-def get_openai_embeddings(text, model="text-embedding-ada-002"):
-    """
-    Generate embeddings for a given text using OpenAI API.
-    """
-    try:
-        response = openai.Embedding.create(input=text, model=model)
-        embeddings = response["data"][0]["embedding"]
-        return np.array(embeddings)  # Convert to NumPy array for compatibility
-    except Exception as e:
-        print(f"Error generating embeddings: {e}")
-        return None
-# Example usage
-if __name__ == "__main__":
-    # Example file path
-    file_path = "example.pdf"
-    # Extract and preprocess text
-    raw_text = extract_text(file_path)
-    preprocessed_text = preprocess_text_generalized(raw_text)
-    # Generate embeddings using OpenAI API
-    embeddings = get_openai_embeddings(preprocessed_text)
-    if embeddings is not None:
-        print(f"Embeddings generated successfully. Shape: {embeddings.shape}")
-    else:
-        print("Failed to generate embeddings.")

+import mimetypes
+import pandas as pd
+import PyPDF2
+import json
+import re
+import spacy
+import os
+from dotenv import load_dotenv
+import openai
+import numpy as np
+# Load environment variables
+load_dotenv()
+# Set OpenAI API Key
+openai.api_key = os.getenv("OPENAI_API_KEY")
+# Load SpaCy model
+# nlp = spacy.load("en_core_web_sm")
+import spacy
+from spacy.cli import download
+# Ensure the model is available
+try:
+    nlp = spacy.load("en_core_web_sm")
+except OSError:
+    print("Downloading SpaCy 'en_core_web_sm' model...")
+    download("en_core_web_sm")
+    nlp = spacy.load("en_core_web_sm")
+# Detect file type
+def detect_file_type(file_path):
+    file_type = mimetypes.guess_type(file_path)[0]
+    if file_type in ["application/pdf"]:
+        return "pdf"
+    elif file_type in ["text/csv", "application/vnd.ms-excel"]:
+        return "csv"
+    elif file_type == "application/json":
+        return "json"
+    else:
+        raise ValueError(f"Unsupported file format: {file_type}")
+# Extract text from CSV
+def extract_text_from_csv(file_path):
+    df = pd.read_csv(file_path)
+    text = " ".join(df.astype(str).stack())
+    return text
+# Extract text from PDF
+def extract_text_from_pdf(file_path):
+    pdf_reader = PyPDF2.PdfReader(file_path)
+    text = ""
+    for page in pdf_reader.pages:
+        text += page.extract_text()
+    return text
+# Extract text from JSON
+def extract_text_from_json(file_path):
+    def recursive_text_extraction(data):
+        if isinstance(data, dict):
+            return " ".join(recursive_text_extraction(value) for value in data.values())
+        elif isinstance(data, list):
+            return " ".join(recursive_text_extraction(item) for item in data)
+        else:
+            return str(data)
+    with open(file_path, 'r') as f:
+        data = json.load(f)
+    return recursive_text_extraction(data)
+# Generalized text extraction
+def extract_text(file_path):
+    file_type = detect_file_type(file_path)
+    if file_type == "csv":
+        return extract_text_from_csv(file_path)
+    elif file_type == "pdf":
+        return extract_text_from_pdf(file_path)
+    elif file_type == "json":
+        return extract_text_from_json(file_path)
+    else:
+        raise ValueError("Unsupported file format")
+# Preprocess text
+def preprocess_text_generalized(text):
+    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # Remove URLs
+    text = re.sub(r"[^\x20-\x7E]", "", text)  # Remove non-ASCII characters
+    text = re.sub(r"\s+", " ", text)  # Normalize whitespace
+    chunk_size = 100000  # Maximum chunk size
+    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
+    processed_chunks = []
+    for chunk in chunks:
+        doc = nlp(chunk.lower())
+        tokens = [
+            token.lemma_
+            for token in doc
+            if not token.is_stop and token.is_alpha
+        ]
+        processed_chunks.append(" ".join(tokens))
+    processed_text = " ".join(processed_chunks)
+    return processed_text
+# Generate embeddings using OpenAI API
+def get_openai_embeddings(text, model="text-embedding-ada-002"):
+    """
+    Generate embeddings for a given text using OpenAI API.
+    """
+    try:
+        response = openai.Embedding.create(input=text, model=model)
+        embeddings = response["data"][0]["embedding"]
+        return np.array(embeddings)  # Convert to NumPy array for compatibility
+    except Exception as e:
+        print(f"Error generating embeddings: {e}")
+        return None
+# Example usage
+if __name__ == "__main__":
+    # Example file path
+    file_path = "example.pdf"
+    # Extract and preprocess text
+    raw_text = extract_text(file_path)
+    preprocessed_text = preprocess_text_generalized(raw_text)
+    # Generate embeddings using OpenAI API
+    embeddings = get_openai_embeddings(preprocessed_text)
+    if embeddings is not None:
+        print(f"Embeddings generated successfully. Shape: {embeddings.shape}")
+    else:
+        print("Failed to generate embeddings.")