Spaces:

Zeri00
/

Cogni-chat-document-reader

Sleeping

App Files Community

riteshraut committed on Oct 12, 2025

Commit

2e541fd

1 Parent(s): 08eb411

Fix

Browse files

Files changed (1) hide show

app.py +54 -157

app.py CHANGED Viewed

@@ -3,39 +3,29 @@ import nltk
 from functools import wraps
 # ============================ NLTK MONKEY-PATCH (MUST BE FIRST) ============================
-# This is the definitive fix. We are intercepting the nltk.download function
-# to force it to always use the correct, writable directory.
 print("Applying NLTK monkey-patch...")
 NLTK_DATA_DIR = '/tmp/nltk_data'
 os.environ['NLTK_DATA'] = NLTK_DATA_DIR
 os.makedirs(NLTK_DATA_DIR, exist_ok=True)
-# Store the original download function
 _original_nltk_download = nltk.download
-# Create a new, patched download function
 @wraps(_original_nltk_download)
 def _patched_nltk_download(info_or_id, download_dir=None, **kwargs):
-    # If the download_dir is not specified (which is the case in the faulty
-    # 'unstructured' call), force it to our writable directory.
     if download_dir is None:
         download_dir = NLTK_DATA_DIR
     print(f"Patched NLTK download called for '{info_or_id}', ensuring download_dir='{download_dir}'")
     return _original_nltk_download(info_or_id, download_dir=download_dir, **kwargs)
-# Replace the original function with our patched version
 nltk.download = _patched_nltk_download
 print("NLTK monkey-patch applied successfully.")
 # ========================================================================================
-# Now that the patch is active, we can proceed with imports and initial downloads.
 print("Running initial NLTK downloads...")
 nltk.download('punkt')
 nltk.download('stopwords')
-nltk.download('averaged_perceptron_tagger_eng')
 print("Initial NLTK downloads complete.")
 import time
@@ -44,22 +34,21 @@ from flask import Flask, request, render_template, session, jsonify, Response, s
 from werkzeug.utils import secure_filename
 from rag_processor import create_rag_chain
-# ============================ ADDITIONS START ============================
 from gtts import gTTS
 import io
 import re
-# ============================ ADDITIONS END ==============================
-# NOW import the rest of the modules that might use NLTK
-# Document Loaders
 from langchain_community.document_loaders import (
     TextLoader,
-    UnstructuredPDFLoader,
     Docx2txtLoader,
-    UnstructuredImageLoader,
 )
-# Text Splitter, Embeddings, Retrievers
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
@@ -69,37 +58,29 @@ from langchain_community.chat_message_histories import ChatMessageHistory
 # --- Basic Flask App Setup ---
 app = Flask(__name__)
-# A secret key is needed for session management
-app.config['SECRET_KEY'] = os.urandom(24)
-# Configure the upload folder
 app.config['UPLOAD_FOLDER'] = '/tmp/uploads'
-# Ensure the upload folder exists
 os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
 # --- In-memory Storage & Global Model Loading ---
 rag_chains = {}
 message_histories = {}
-# Load the embedding model once when the application starts for efficiency.
 print("Loading embedding model...")
 EMBEDDING_MODEL = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
 print("Embedding model loaded successfully.")
-# A dictionary to map file extensions to their corresponding loader classes
 LOADER_MAPPING = {
     ".txt": TextLoader,
-    ".pdf": UnstructuredPDFLoader,
     ".docx": Docx2txtLoader,
-    ".jpeg": UnstructuredImageLoader,
-    ".jpg": UnstructuredImageLoader,
-    ".png": UnstructuredImageLoader,
 }
 def get_session_history(session_id: str) -> ChatMessageHistory:
-    """
-    Retrieves the chat history for a given session ID. If it doesn't exist,
-    a new history object is created.
-    """
     if session_id not in message_histories:
         message_histories[session_id] = ChatMessageHistory()
     return message_histories[session_id]
@@ -111,23 +92,18 @@ def index():
 @app.route('/upload', methods=['POST'])
 def upload_files():
-    """Handles multiple file uploads, processing, and RAG chain creation."""
-    # Ensure NLTK is still configured correctly
-    if 'NLTK_DATA' not in os.environ:
-        os.environ['NLTK_DATA'] = '/tmp/nltk_data'
     files = request.files.getlist('file')
     if not files or all(f.filename == '' for f in files):
         return jsonify({'status': 'error', 'message': 'No selected files.'}), 400
     all_docs = []
     all_filenames = []
     failed_files = []
     try:
-        print(f"Processing {len(files)} files...")
         for file in files:
             if file and file.filename:
                 filename = secure_filename(file.filename)
@@ -136,159 +112,90 @@ def upload_files():
                 try:
                     file.save(filepath)
-                    print(f"Saved file: {filename} at {filepath}")
                     file_extension = os.path.splitext(filename)[1].lower()
-                    if file_extension not in LOADER_MAPPING:
-                        print(f"Skipping unsupported file type: {filename}")
-                        failed_files.append(f"{filename} (unsupported format)")
-                        continue
-                    loader_class = LOADER_MAPPING[file_extension]
-                    loader_kwargs = {}
-                    if file_extension in [".jpeg", ".jpg", ".png"]:
-                        loader_kwargs['mode'] = 'single'
-                    elif file_extension == ".pdf":
-                        loader_kwargs['strategy'] = 'hi_res'
-                        loader_kwargs['languages'] = ['eng']
-                    print(f"Loading {filename} with {loader_class.__name__}...")
-                    loader = loader_class(filepath, **loader_kwargs)
-                    loaded_docs = loader.load()
-                    # Check if documents were actually loaded
-                    if loaded_docs:
-                        print(f"Successfully loaded {len(loaded_docs)} documents from {filename}")
-                        # Check if the documents have content
-                        for doc in loaded_docs:
-                            if hasattr(doc, 'page_content') and doc.page_content:
-                                print(f"Document content preview (first 100 chars): {doc.page_content[:100]}")
-                            else:
-                                print(f"Warning: Document from {filename} has no content")
-                        all_docs.extend(loaded_docs)
-                    else:
-                        print(f"Warning: No documents loaded from {filename}")
-                        failed_files.append(f"{filename} (no content extracted)")
                 except Exception as e:
                     print(f"Error processing file {filename}: {e}")
-                    failed_files.append(f"{filename} (processing error: {str(e)})")
                     continue
         if not all_docs:
-            error_msg = "No processable content was extracted from the uploaded files."
-            if failed_files:
-                error_msg += f" Failed files: {', '.join(failed_files)}"
-            print(error_msg)
             return jsonify({'status': 'error', 'message': error_msg}), 400
-        # --- Process all documents together ---
-        print(f"Total documents loaded: {len(all_docs)}")
-        # Split documents into chunks
-        text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=1000,
-            chunk_overlap=200,
-            length_function=len,
-            separators=["\n\n", "\n", " ", ""]
-        )
         splits = text_splitter.split_documents(all_docs)
-        print(f"Documents split into {len(splits)} chunks")
-        # Verify that splits have content
         if not splits:
             return jsonify({
                 'status': 'error',
-                'message': 'Documents were loaded but no text chunks were created. The files might be empty or contain only non-text content.'
             }), 400
-        # Additional check for empty chunks
-        non_empty_splits = [s for s in splits if s.page_content and s.page_content.strip()]
-        if not non_empty_splits:
-            return jsonify({
-                'status': 'error',
-                'message': 'All text chunks are empty. Please check if your files contain readable text.'
-            }), 400
-        if len(non_empty_splits) < len(splits):
-            print(f"Warning: {len(splits) - len(non_empty_splits)} empty chunks were filtered out")
-            splits = non_empty_splits
-        print(f"Creating vector store with {len(splits)} non-empty chunks...")
-        try:
-            vectorstore = FAISS.from_documents(documents=splits, embedding=EMBEDDING_MODEL)
-            print("Vector store created successfully")
-        except IndexError as e:
-            print(f"IndexError creating vector store: {e}")
-            return jsonify({
-                'status': 'error',
-                'message': 'Failed to create embeddings. The documents might not contain enough text content.'
-            }), 500
-        except Exception as e:
-            print(f"Error creating vector store: {e}")
-            return jsonify({
-                'status': 'error',
-                'message': f'Failed to create vector store: {str(e)}'
-            }), 500
-        # Create retrievers
-        print("Creating BM25 retriever...")
         bm25_retriever = BM25Retriever.from_documents(splits)
         bm25_retriever.k = 5
-        print("Creating FAISS retriever...")
         faiss_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
-        print("Creating ensemble retriever...")
-        ensemble_retriever = EnsembleRetriever(
-            retrievers=[bm25_retriever, faiss_retriever],
-            weights=[0.5, 0.5]
-        )
-        # Create session and RAG chain
         session_id = str(uuid.uuid4())
         rag_chains[session_id] = create_rag_chain(ensemble_retriever, get_session_history)
-        print(f"RAG chain created for session {session_id} with {len(all_filenames)} documents.")
         session['session_id'] = session_id
-        # Prepare response
-        display_filenames = ", ".join(all_filenames)
-        response_data = {'status': 'success', 'filename': display_filenames}
         if failed_files:
             response_data['warnings'] = f"Some files could not be processed: {', '.join(failed_files)}"
         return jsonify(response_data)
     except Exception as e:
-        print(f"Unexpected error creating RAG chain: {e}")
         import traceback
         traceback.print_exc()
-        return jsonify({'status': 'error', 'message': f'Failed to process files: {str(e)}'}), 500
 @app.route('/chat', methods=['POST'])
 def chat():
-    """Handles chat messages and streams the response with memory."""
     data = request.get_json()
     question = data.get('question')
     session_id = session.get('session_id')
-    if not all([question, session_id]):
-        return jsonify({'status': 'error', 'message': 'Missing data in request.'}), 400
-    if session_id not in rag_chains:
-        return jsonify({'status': 'error', 'message': 'Session not found. Please upload documents again.'}), 400
     try:
         rag_chain = rag_chains[session_id]
         config = {"configurable": {"session_id": session_id}}
         def generate():
-            """A generator function to stream the response."""
             for chunk in rag_chain.stream({"question": question, "config": config}):
                 yield chunk
@@ -298,29 +205,22 @@ def chat():
         print(f"Error during chat invocation: {e}")
         return Response("An error occurred while getting the answer.", status=500, mimetype='text/plain')
-# ============================ ADDITIONS START ============================
 def clean_markdown_for_tts(text: str) -> str:
-    """Removes markdown formatting for cleaner text-to-speech output."""
-    # Remove bold (**text**) and italics (*text* or _text_)
     text = re.sub(r'\*(\*?)(.*?)\1\*', r'\2', text)
     text = re.sub(r'\_(.*?)\_', r'\1', text)
-    # Remove inline code (`code`)
     text = re.sub(r'`(.*?)`', r'\1', text)
-    # Remove headings (e.g., #, ##, ###)
     text = re.sub(r'^\s*#{1,6}\s+', '', text, flags=re.MULTILINE)
-    # Remove list item markers (*, -, 1.)
     text = re.sub(r'^\s*[\*\-]\s+', '', text, flags=re.MULTILINE)
     text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)
-    # Remove blockquotes (>)
     text = re.sub(r'^\s*>\s?', '', text, flags=re.MULTILINE)
-    # Replace multiple newlines with a single space
     text = re.sub(r'\n+', ' ', text)
     return text.strip()
 @app.route('/tts', methods=['POST'])
 def text_to_speech():
-    """Generates audio from text and returns it as an MP3 stream."""
     data = request.get_json()
     text = data.get('text')
@@ -328,9 +228,7 @@ def text_to_speech():
         return jsonify({'status': 'error', 'message': 'No text provided.'}), 400
     try:
-        # Clean the text before sending to gTTS
         clean_text = clean_markdown_for_tts(text)
         tts = gTTS(clean_text, lang='en')
         mp3_fp = io.BytesIO()
         tts.write_to_fp(mp3_fp)
@@ -339,7 +237,6 @@ def text_to_speech():
     except Exception as e:
         print(f"Error in TTS generation: {e}")
         return jsonify({'status': 'error', 'message': 'Failed to generate audio.'}), 500
-# ============================ ADDITIONS END ==============================
 if __name__ == '__main__':
     app.run(debug=True, port=5001)

 from functools import wraps
 # ============================ NLTK MONKEY-PATCH (MUST BE FIRST) ============================
+# This patch ensures NLTK downloads to a writable directory on platforms like Hugging Face Spaces.
 print("Applying NLTK monkey-patch...")
 NLTK_DATA_DIR = '/tmp/nltk_data'
 os.environ['NLTK_DATA'] = NLTK_DATA_DIR
 os.makedirs(NLTK_DATA_DIR, exist_ok=True)
 _original_nltk_download = nltk.download
 @wraps(_original_nltk_download)
 def _patched_nltk_download(info_or_id, download_dir=None, **kwargs):
     if download_dir is None:
         download_dir = NLTK_DATA_DIR
     print(f"Patched NLTK download called for '{info_or_id}', ensuring download_dir='{download_dir}'")
     return _original_nltk_download(info_or_id, download_dir=download_dir, **kwargs)
 nltk.download = _patched_nltk_download
 print("NLTK monkey-patch applied successfully.")
 # ========================================================================================
+# Now that the patch is active, we can proceed with initial downloads.
 print("Running initial NLTK downloads...")
 nltk.download('punkt')
 nltk.download('stopwords')
 print("Initial NLTK downloads complete.")
 import time
 from werkzeug.utils import secure_filename
 from rag_processor import create_rag_chain
+# --- Text-to-Speech Additions ---
 from gtts import gTTS
 import io
 import re
+# --- MODIFIED: Lightweight Document Loaders ---
+# We are only importing loaders for text-based files to keep the app lightweight.
+# PyPDFLoader is used for text-based PDFs. Unstructured loaders for images are removed.
 from langchain_community.document_loaders import (
     TextLoader,
     Docx2txtLoader,
+    PyPDFLoader, # Lightweight PDF loader
 )
+# --- Standard LangChain Components ---
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
 # --- Basic Flask App Setup ---
 app = Flask(__name__)
+app.config['SECRET_KEY'] = os.urandom(24)
 app.config['UPLOAD_FOLDER'] = '/tmp/uploads'
 os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
 # --- In-memory Storage & Global Model Loading ---
 rag_chains = {}
 message_histories = {}
+# The 'all-MiniLM-L6-v2' model is already a great lightweight choice. No changes needed here.
 print("Loading embedding model...")
 EMBEDDING_MODEL = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
 print("Embedding model loaded successfully.")
+# --- MODIFIED: Lightweight Loader Mapping ---
+# This mapping now only includes loaders for text-based files.
 LOADER_MAPPING = {
     ".txt": TextLoader,
+    ".pdf": PyPDFLoader,
     ".docx": Docx2txtLoader,
 }
 def get_session_history(session_id: str) -> ChatMessageHistory:
+    """Retrieves or creates a chat history for a given session ID."""
     if session_id not in message_histories:
         message_histories[session_id] = ChatMessageHistory()
     return message_histories[session_id]
 @app.route('/upload', methods=['POST'])
 def upload_files():
+    """Handles file uploads using a lightweight, text-only processing strategy."""
     files = request.files.getlist('file')
     if not files or all(f.filename == '' for f in files):
         return jsonify({'status': 'error', 'message': 'No selected files.'}), 400
     all_docs = []
     all_filenames = []
     failed_files = []
     try:
+        print(f"Processing {len(files)} files with a lightweight strategy...")
         for file in files:
             if file and file.filename:
                 filename = secure_filename(file.filename)
                 try:
                     file.save(filepath)
+                    print(f"Saved file: {filename}")
                     file_extension = os.path.splitext(filename)[1].lower()
+                    # --- REVISED: Simplified Loading Logic ---
+                    if file_extension in LOADER_MAPPING:
+                        loader_class = LOADER_MAPPING[file_extension]
+                        print(f"Loading {filename} with {loader_class.__name__}...")
+                        loader = loader_class(filepath)
+                        loaded_docs = loader.load()
+                        # Crucial Check: Ensure content was actually extracted.
+                        # This is important for scanned PDFs, where PyPDFLoader will produce no text.
+                        if loaded_docs and any(doc.page_content.strip() for doc in loaded_docs):
+                            all_docs.extend(loaded_docs)
+                        else:
+                            print(f"Warning: No text content found in {filename}. It might be empty or image-based.")
+                            failed_files.append(f"{filename} (no text found)")
+                    else:
+                        print(f"Skipping unsupported file type: {filename}")
+                        failed_files.append(f"{filename} (unsupported format)")
                 except Exception as e:
                     print(f"Error processing file {filename}: {e}")
+                    failed_files.append(f"{filename} (processing error)")
                     continue
         if not all_docs:
+            error_msg = "No processable text content was extracted from the uploaded files. Please ensure files are not empty, corrupted, or image-based."
             return jsonify({'status': 'error', 'message': error_msg}), 400
+        # --- Process all documents together (No changes from here on) ---
+        print(f"Total documents with text loaded: {len(all_docs)}")
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
         splits = text_splitter.split_documents(all_docs)
         if not splits:
             return jsonify({
                 'status': 'error',
+                'message': 'Loaded documents but could not create text chunks. Check file content.'
             }), 400
+        print(f"Documents split into {len(splits)} chunks. Creating vector store...")
+        vectorstore = FAISS.from_documents(documents=splits, embedding=EMBEDDING_MODEL)
         bm25_retriever = BM25Retriever.from_documents(splits)
         bm25_retriever.k = 5
         faiss_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
+        ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, faiss_retriever], weights=[0.5, 0.5])
         session_id = str(uuid.uuid4())
         rag_chains[session_id] = create_rag_chain(ensemble_retriever, get_session_history)
         session['session_id'] = session_id
+        print(f"RAG chain created for session {session_id}.")
+        response_data = {'status': 'success', 'filename': ", ".join(all_filenames)}
         if failed_files:
             response_data['warnings'] = f"Some files could not be processed: {', '.join(failed_files)}"
         return jsonify(response_data)
     except Exception as e:
         import traceback
         traceback.print_exc()
+        return jsonify({'status': 'error', 'message': f'An unexpected error occurred: {str(e)}'}), 500
 @app.route('/chat', methods=['POST'])
 def chat():
+    """Handles chat messages and streams the response."""
     data = request.get_json()
     question = data.get('question')
     session_id = session.get('session_id')
+    if not all([question, session_id]) or session_id not in rag_chains:
+        return jsonify({'status': 'error', 'message': 'Session not found or invalid. Please upload documents again.'}), 400
     try:
         rag_chain = rag_chains[session_id]
         config = {"configurable": {"session_id": session_id}}
         def generate():
             for chunk in rag_chain.stream({"question": question, "config": config}):
                 yield chunk
         print(f"Error during chat invocation: {e}")
         return Response("An error occurred while getting the answer.", status=500, mimetype='text/plain')
+# ============================ Text-to-Speech Functions ============================
 def clean_markdown_for_tts(text: str) -> str:
+    """Removes markdown for cleaner text-to-speech output."""
     text = re.sub(r'\*(\*?)(.*?)\1\*', r'\2', text)
     text = re.sub(r'\_(.*?)\_', r'\1', text)
     text = re.sub(r'`(.*?)`', r'\1', text)
     text = re.sub(r'^\s*#{1,6}\s+', '', text, flags=re.MULTILINE)
     text = re.sub(r'^\s*[\*\-]\s+', '', text, flags=re.MULTILINE)
     text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)
     text = re.sub(r'^\s*>\s?', '', text, flags=re.MULTILINE)
     text = re.sub(r'\n+', ' ', text)
     return text.strip()
 @app.route('/tts', methods=['POST'])
 def text_to_speech():
+    """Generates audio from text."""
     data = request.get_json()
     text = data.get('text')
         return jsonify({'status': 'error', 'message': 'No text provided.'}), 400
     try:
         clean_text = clean_markdown_for_tts(text)
         tts = gTTS(clean_text, lang='en')
         mp3_fp = io.BytesIO()
         tts.write_to_fp(mp3_fp)
     except Exception as e:
         print(f"Error in TTS generation: {e}")
         return jsonify({'status': 'error', 'message': 'Failed to generate audio.'}), 500
 if __name__ == '__main__':
     app.run(debug=True, port=5001)