handled punkt error2
app.py
CHANGED
@@ -3,6 +3,7 @@ import logging
 import re
 import nltk
 import spacy
+import traceback
 from nltk.tokenize import sent_tokenize
 from langchain.vectorstores import Chroma
 from langchain_core.output_parsers import StrOutputParser
@@ -18,18 +19,6 @@ import gradio as gr
 import pandas as pd
 import json
 
-# Download required NLTK resources
-nltk.download('punkt')
-
-# Load spaCy English model
-try:
-    nlp = spacy.load("en_core_web_sm")
-except OSError:
-    # If the model is not found, download it
-    from spacy.cli import download
-    download("en_core_web_sm")
-    nlp = spacy.load("en_core_web_sm")
-
 # Enable logging for debugging
 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)
@@ -49,6 +38,32 @@ api_key = clean_api_key(api_key).strip() # Clean and strip whitespace
 def clean_text(text):
     return text.encode("ascii", errors="ignore").decode()
 
+# Download required NLTK resources
+try:
+    nltk.download('punkt', download_dir='/tmp/nltk_data')
+    nltk.data.path.append('/tmp/nltk_data')
+    logger.debug("NLTK 'punkt' resource downloaded successfully.")
+except Exception as e:
+    logger.error("Failed to download NLTK 'punkt' resource.")
+    logger.error(traceback.format_exc())
+    raise e
+
+# Load spaCy English model
+try:
+    nlp = spacy.load("en_core_web_sm")
+    logger.debug("spaCy 'en_core_web_sm' model loaded successfully.")
+except OSError:
+    try:
+        logger.debug("spaCy model not found. Downloading 'en_core_web_sm'.")
+        from spacy.cli import download
+        download("en_core_web_sm")
+        nlp = spacy.load("en_core_web_sm")
+        logger.debug("spaCy 'en_core_web_sm' model downloaded and loaded successfully.")
+    except Exception as e:
+        logger.error("Failed to download and load spaCy 'en_core_web_sm' model.")
+        logger.error(traceback.format_exc())
+        raise e
+
 # Function to load and clean documents from multiple file formats
 def load_documents(file_paths):
     docs = []
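On Hugging Face Spaces the default NLTK download directory is typically not writable, which is presumably what broke `punkt` loading; the commit therefore downloads into `/tmp/nltk_data` and registers that directory on NLTK's search path. A minimal standalone sketch of the same pattern (only the `/tmp/nltk_data` location comes from this diff; the rest is standard NLTK API):

import nltk
from nltk.tokenize import sent_tokenize

# Download 'punkt' into a writable directory and make NLTK search it.
NLTK_DIR = "/tmp/nltk_data"
nltk.download("punkt", download_dir=NLTK_DIR)
nltk.data.path.append(NLTK_DIR)

# Tokenization now resolves 'punkt' from /tmp/nltk_data.
print(sent_tokenize("It works. Even with a read-only home directory."))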
@@ -84,14 +99,32 @@ def load_documents(file_paths):
                 logger.warning(f"Unsupported file format: {file_path}")
         except Exception as e:
             logger.error(f"Error processing file {file_path}: {e}")
+            logger.error(traceback.format_exc())
     return docs
 
 # Function to ensure the response ends with complete sentences using NLTK
 def ensure_complete_sentences(text):
-    sentences = sent_tokenize(text)
-    if sentences:
-        return ' '.join(sentences).strip()
-    return text  # Return as is if no complete sentence is found
+    logger.debug("Ensuring complete sentences for the given text.")
+    try:
+        sentences = sent_tokenize(text)
+        if sentences:
+            return ' '.join(sentences).strip()
+        return text  # Return as is if no complete sentence is found
+    except LookupError as e:
+        logger.error("NLTK resource 'punkt' not found. Attempting to download again.")
+        try:
+            nltk.download('punkt', download_dir='/tmp/nltk_data')
+            nltk.data.path.append('/tmp/nltk_data')
+            sentences = sent_tokenize(text)
+            return ' '.join(sentences).strip()
+        except Exception as e_inner:
+            logger.error("Failed to download 'punkt' resource.")
+            logger.error(traceback.format_exc())
+            raise e_inner
+    except Exception as e:
+        logger.error("Unexpected error during sentence tokenization.")
+        logger.error(traceback.format_exc())
+        raise e
 
 # Advanced input validation using spaCy (Section 8a)
 def is_valid_input_nlp(text, threshold=0.5):
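Note that joining every tokenized sentence back together returns essentially the original text, so the rewritten function mainly guards against tokenizer errors. If the intent is to actually drop an incomplete trailing sentence from a token-limited completion, a variant along these lines would do it (a hypothetical helper, not part of this commit; assumes 'punkt' is available):

from nltk.tokenize import sent_tokenize

def trim_incomplete_tail(text):
    # Drop a trailing fragment that does not end in sentence punctuation.
    sentences = sent_tokenize(text)
    if len(sentences) > 1 and sentences[-1].rstrip()[-1:] not in ".!?":
        sentences = sentences[:-1]
    return " ".join(s.strip() for s in sentences)

print(trim_incomplete_tail("First point. Second point. And then the model ran out of"))
# -> "First point. Second point."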
@@ -106,12 +139,15 @@ def is_valid_input_nlp(text, threshold=0.5):
         - bool: True if the input is valid, False otherwise.
     """
     if not text or text.strip() == "":
+        logger.debug("Input text is empty or contains only whitespace.")
         return False
     doc = nlp(text)
     meaningful_tokens = [token for token in doc if token.is_alpha]
     if not meaningful_tokens:
+        logger.debug("No meaningful (alphabetic) tokens found in input.")
        return False
     ratio = len(meaningful_tokens) / len(doc)
+    logger.debug(f"Meaningful tokens ratio: {ratio}")
     return ratio >= threshold
 
 # Function to estimate prompt tokens (simple word count approximation)
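For intuition on the 0.5 threshold: spaCy tokenizes "Hello world!" into three tokens, two of them alphabetic, so the ratio is 2/3 ≈ 0.67 and the input passes, while a string of digits and punctuation has no alphabetic tokens and is rejected. A quick check, assuming `nlp` is loaded as above:

doc = nlp("Hello world!")
meaningful = [t for t in doc if t.is_alpha]
print(len(meaningful), len(doc))            # 2 3
print(len(meaningful) / len(doc) >= 0.5)    # True
print(is_valid_input_nlp("12345 ???"))      # False: no alphabetic tokens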
@@ -133,9 +169,11 @@ def initialize_llm(model, temperature, max_tokens, prompt_template):
     try:
         # Estimate prompt tokens
        estimated_prompt_tokens = estimate_prompt_tokens(prompt_template)
+        logger.debug(f"Estimated prompt tokens: {estimated_prompt_tokens}")
 
         # Allocate remaining tokens to response
         response_max_tokens = max_tokens - estimated_prompt_tokens
+        logger.debug(f"Response max tokens: {response_max_tokens}")
 
         if response_max_tokens <= 100:
             raise ValueError("max_tokens is too small to allocate for the response.")
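`estimate_prompt_tokens` itself is not shown in this diff; going by the "simple word count approximation" comment elsewhere in the file, it is presumably something like this sketch (a hypothetical reconstruction):

def estimate_prompt_tokens(prompt_template):
    # Rough approximation: one token per whitespace-separated word.
    return len(prompt_template.split())

# Budgeting as in initialize_llm: a 60-word template under max_tokens=1000
# leaves roughly 940 tokens for the response.
print(1000 - estimate_prompt_tokens("word " * 60))  # 940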
@@ -150,7 +188,8 @@ def initialize_llm(model, temperature, max_tokens, prompt_template):
         return llm
     except Exception as e:
         logger.error(f"Error initializing LLM: {e}")
-        return None
+        logger.error(traceback.format_exc())
+        raise e
 
 # Create the RAG pipeline
 def create_rag_pipeline(file_paths, model, temperature, max_tokens):
@@ -173,6 +212,7 @@ def create_rag_pipeline(file_paths, model, temperature, max_tokens):
 
     # Estimate prompt tokens
     estimated_prompt_tokens = estimate_prompt_tokens(custom_prompt_template.template)
+    logger.debug(f"Estimated prompt tokens from template: {estimated_prompt_tokens}")
 
     # Initialize the LLM with token allocation
     llm = initialize_llm(model, temperature, max_tokens, custom_prompt_template.template)
@@ -186,15 +226,17 @@ def create_rag_pipeline(file_paths, model, temperature, max_tokens):
         # Split documents into chunks
         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
         splits = text_splitter.split_documents(docs)
+        logger.debug(f"Documents split into {len(splits)} chunks.")
 
         # Initialize the embedding model
         embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+        logger.debug("Embedding model initialized successfully.")
 
-        # Use a
+        # Use a temporary directory for Chroma vectorstore to prevent caching issues on Hugging Face Spaces
         vectorstore = Chroma.from_documents(
             documents=splits,
             embedding=embedding_model,
-            persist_directory="
+            persist_directory="/tmp/chroma_db"  # Temporary storage directory
         )
         vectorstore.persist()  # Save the database to disk
         logger.debug("Vectorstore initialized and persisted successfully.")
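Since `/tmp/chroma_db` is cleared between Space restarts, persistence here only spans a single session; within that session, though, the store can be reopened without re-embedding the documents. A sketch, assuming the same embedding model and the LangChain version this file already imports:

from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Reopen the store that create_rag_pipeline persisted earlier in the session.
vectorstore = Chroma(
    persist_directory="/tmp/chroma_db",
    embedding_function=embedding_model,
)
print(vectorstore.similarity_search("example query", k=3))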
@@ -212,6 +254,7 @@ def create_rag_pipeline(file_paths, model, temperature, max_tokens):
         return rag_chain, "Pipeline created successfully."
     except Exception as e:
         logger.error(f"Error creating RAG pipeline: {e}")
+        logger.error(traceback.format_exc())
         return None, f"Error creating RAG pipeline: {e}"
 
 # Function to handle feedback (Section 8d)
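`logger.error(traceback.format_exc())`, added throughout this commit, records the full stack trace instead of only `str(e)`. The standard library's `logger.exception` is shorthand for the same thing inside an `except` block:

import logging
import traceback

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

try:
    1 / 0
except Exception as e:
    logger.error(f"Error: {e}")                # message only
    logger.error(traceback.format_exc())       # full stack trace, as in this commit
    logger.exception("Error with traceback")   # stdlib shorthand for the same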
@@ -235,27 +278,36 @@ def handle_feedback(feedback_text):
 
 # Function to answer questions with input validation and post-processing
 def answer_question(file_paths, model, temperature, max_tokens, question, feedback):
-    # Validate input using spaCy-based validation
-    if not is_valid_input_nlp(question):
-        return "Please provide a valid question or input containing meaningful text.", ""
-
-    rag_chain, message = create_rag_pipeline(file_paths, model, temperature, max_tokens)
-    if rag_chain is None:
-        return message, ""
-
     try:
-        answer = rag_chain.run(question)
-        # Post-process to ensure the answer ends with complete sentences
-        complete_answer = ensure_complete_sentences(answer)
-
-        # Handle feedback
-        feedback_response = handle_feedback(feedback)
-
-        return complete_answer, feedback_response
-    except Exception as e:
-        logger.error(f"Error during RAG pipeline execution: {e}")
-        return f"Error during RAG pipeline execution: {e}", ""
+        # Validate input using spaCy-based validation
+        if not is_valid_input_nlp(question):
+            logger.debug("Invalid input detected.")
+            return "Please provide a valid question or input containing meaningful text.", ""
+
+        rag_chain, message = create_rag_pipeline(file_paths, model, temperature, max_tokens)
+        if rag_chain is None:
+            logger.debug("RAG pipeline creation failed.")
+            return message, ""
+
+        try:
+            answer = rag_chain.run(question)
+            logger.debug("Question answered successfully.")
+            # Post-process to ensure the answer ends with complete sentences
+            complete_answer = ensure_complete_sentences(answer)
+
+            # Handle feedback
+            feedback_response = handle_feedback(feedback)
+
+            return complete_answer, feedback_response
+        except Exception as e_inner:
+            logger.error(f"Error during RAG pipeline execution: {e_inner}")
+            logger.error(traceback.format_exc())
+            return f"Error during RAG pipeline execution: {e_inner}", ""
+
+    except Exception as e_outer:
+        logger.error(f"Unexpected error in answer_question: {e_outer}")
+        logger.error(traceback.format_exc())
+        return f"Unexpected error: {e_outer}", ""
 
 # Gradio Interface with Feedback Mechanism (Section 8d)
 def gradio_interface(model, temperature, max_tokens, question, feedback):