Final_Assignment_Template

Sleeping

App Files Files Community

ChillThrills committed on May 13, 2025

Commit

b96a1eb

1 Parent(s): 4e5a109

add new dependencies: transformers, torch, librosa, openpyxl, and pdfplumber

Browse files

Files changed (2) hide show

app.py +172 -50
requirements.txt +6 -1

app.py CHANGED Viewed

@@ -47,6 +47,34 @@ except ImportError:
     TavilyClient = None
     print("WARNING: tavily-python library not found. Tavily search provider will be unavailable. Install: pip install tavily-python")
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
@@ -64,9 +92,13 @@ TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
 AGENT_DEFAULT_TIMEOUT = 15
 MAX_CONTEXT_LENGTH_LLM = 15000
-MAX_FILE_SIZE = 5 * 1024 * 1024
 CSV_SAMPLE_ROWS = 3
-MAX_FILE_CONTEXT_LENGTH = 7000
 DEFAULT_RAG_CONFIG = {
     'search': {
@@ -98,6 +130,25 @@ DEFAULT_RAG_CONFIG = {
 }
 class FileProcessor:
     @staticmethod
     def _get_filename_from_url(url_or_header: Optional[str]) -> str:
         if not url_or_header: return "unknown_file"
@@ -112,22 +163,34 @@ class FileProcessor:
     @staticmethod
     def process(content: bytes, filename: Optional[str] = "unknown_file", content_type: Optional[str] = "") -> str:
         content_type_str = content_type.lower() if content_type else ""
-        filename_str = filename if filename else "unknown_file"
         try:
             if len(content) > MAX_FILE_SIZE:
                 gaia_logger.warning(f"File '{filename_str}' exceeds max size {MAX_FILE_SIZE} bytes.")
                 return f"Error: File '{filename_str}' exceeds maximum allowed size ({MAX_FILE_SIZE // (1024*1024)}MB)."
-            if 'csv' in content_type_str or filename_str.lower().endswith('.csv'):
                 gaia_logger.info(f"Processing CSV file: {filename_str}")
                 return FileProcessor._process_csv(content, filename_str)
-            elif 'json' in content_type_str or filename_str.lower().endswith('.json'):
                 gaia_logger.info(f"Processing JSON file: {filename_str}")
                 return FileProcessor._process_json(content, filename_str)
             elif 'text/plain' in content_type_str or \
                  ('text/' in content_type_str and not any(sub in content_type_str for sub in ['html', 'xml'])) or \
-                 filename_str.lower().endswith(('.txt', '.md', '.py', '.js', '.c', '.cpp', '.java', '.html', '.xml', '.log')):
                 gaia_logger.info(f"Processing Text-like file: {filename_str} (Content-Type: {content_type_str})")
                 return FileProcessor._process_text(content, filename_str)
             else:
@@ -137,6 +200,14 @@ class FileProcessor:
             gaia_logger.error(f"File processing error for '{filename_str}': {str(e)}", exc_info=True)
             return f"Error processing file '{filename_str}': An unexpected error occurred."
     @staticmethod
     def _process_csv(content: bytes, filename: str) -> str:
         try:
@@ -145,54 +216,35 @@ class FileProcessor:
             for enc in encodings_to_try:
                 try:
                     df = pd.read_csv(io.BytesIO(content), encoding=enc)
-                    gaia_logger.info(f"Successfully read CSV '{filename}' with encoding '{enc}'.")
                     break
-                except UnicodeDecodeError:
-                    continue
-                except Exception as read_e:
-                    gaia_logger.warning(f"Pandas read_csv error for '{filename}' with encoding '{enc}': {read_e}")
-                    continue
-            if df is None:
-                gaia_logger.error(f"Failed to decode CSV '{filename}' with any attempted encoding.")
-                return f"Error: Could not decode CSV file '{filename}'. It might be corrupted or use an unsupported encoding."
             num_rows, num_cols = len(df), len(df.columns)
             cols_str = ', '.join(df.columns)
             sample_str = df.head(CSV_SAMPLE_ROWS).to_markdown(index=False)
             summary = (
                 f"CSV Document Summary: '{filename}' ({num_rows} rows, {num_cols} columns):\n"
-                f"Columns: {cols_str}\n"
-                f"First {min(CSV_SAMPLE_ROWS, num_rows)} sample rows:\n{sample_str}"
             )
-            if len(summary) > MAX_FILE_CONTEXT_LENGTH:
-                summary = summary[:MAX_FILE_CONTEXT_LENGTH - 20] + "\n... (summary truncated)"
-            return summary
         except Exception as e:
-            gaia_logger.error(f"CSV processing error for '{filename}': {str(e)}", exc_info=True)
-            return f"Error processing CSV file '{filename}': {str(e)}"
     @staticmethod
     def _process_json(content: bytes, filename: str) -> str:
         try:
             decoded_content = content.decode('utf-8', errors='replace')
             data = json.loads(decoded_content)
-            compact_json = json.dumps(data, separators=(',', ':'))
-            if len(compact_json) > MAX_FILE_CONTEXT_LENGTH:
-                pretty_truncated = json.dumps(data, indent=2)[:MAX_FILE_CONTEXT_LENGTH - 20] + "\n... (JSON truncated)"
-                return f"JSON Document: '{filename}' (Content partially shown due to size):\n{pretty_truncated}"
             pretty_json = json.dumps(data, indent=2)
-            return f"JSON Document: '{filename}':\n{pretty_json}"
-        except json.JSONDecodeError as e:
-            gaia_logger.error(f"JSON decoding error for '{filename}': {str(e)} - trying to return raw text snippet.")
             text_snippet = content.decode('utf-8', errors='ignore')[:MAX_FILE_CONTEXT_LENGTH - 100]
-            return f"Error: Invalid JSON format in '{filename}'. Content snippet (may be incomplete or unparsable):\n{text_snippet}..."
         except Exception as e:
-            gaia_logger.error(f"JSON processing error for '{filename}': {str(e)}", exc_info=True)
-            return f"Error processing JSON file '{filename}': {str(e)}"
     @staticmethod
     def _process_text(content: bytes, filename: str) -> str:
@@ -202,24 +254,94 @@ class FileProcessor:
             for enc in encodings_to_try:
                 try:
                     text = content.decode(enc)
-                    gaia_logger.info(f"Successfully decoded text file '{filename}' with encoding '{enc}'.")
                     break
-                except UnicodeDecodeError:
-                    continue
-            if text is None:
-                gaia_logger.error(f"Failed to decode text file '{filename}' with any attempted encoding.")
-                text = content.decode('utf-8', errors='ignore')
-                return f"Text Document (decoding issues, some characters may be lost): '{filename}':\n{text[:MAX_FILE_CONTEXT_LENGTH]}..." if len(text) > MAX_FILE_CONTEXT_LENGTH else text
-            summary = f"Text Document: '{filename}':\n{text[:MAX_FILE_CONTEXT_LENGTH]}"
-            if len(text) > MAX_FILE_CONTEXT_LENGTH:
-                summary += "..."
-            return summary
         except Exception as e:
-            gaia_logger.error(f"Text processing error for '{filename}': {str(e)}", exc_info=True)
             return f"Error processing text file '{filename}': {str(e)}"
     @staticmethod
     def _handle_unknown_type(content: bytes, filename: str) -> str:
         gaia_logger.warning(f"Attempting to handle unknown file type for '{filename}' as text snippet.")

     TavilyClient = None
     print("WARNING: tavily-python library not found. Tavily search provider will be unavailable. Install: pip install tavily-python")
+try:
+    from transformers import pipeline as hf_transformers_pipeline
+    import torch
+except ImportError:
+    hf_transformers_pipeline = None
+    torch = None
+    print("WARNING: transformers or torch library not found. Audio processing will be unavailable. Install with: pip install transformers torch")
+try:
+    import librosa
+except ImportError:
+    librosa = None
+    print("WARNING: librosa library not found. Audio processing may be impaired. Install with: pip install librosa")
+try:
+    import openpyxl # Engine for pandas to read .xlsx
+except ImportError:
+    openpyxl = None
+    print("WARNING: openpyxl library not found. .xlsx file processing might fail. Install with: pip install openpyxl")
+try:
+    import pdfplumber
+except ImportError:
+    pdfplumber = None
+    print("WARNING: pdfplumber library not found. PDF file processing will be unavailable. Install with: pip install pdfplumber")
+# --- End of New Imports ---
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
 AGENT_DEFAULT_TIMEOUT = 15
 MAX_CONTEXT_LENGTH_LLM = 15000
+MAX_FILE_SIZE = 5 * 1024 * 1024 # 5MB
 CSV_SAMPLE_ROWS = 3
+MAX_FILE_CONTEXT_LENGTH = 7000 # Max characters for file context summary
+# Global variable for ASR pipeline (initialized on first use)
+asr_pipeline_instance: Optional[Any] = None
+ASR_MODEL_NAME = "openai/whisper-tiny" # Smaller model for resource efficiency
 DEFAULT_RAG_CONFIG = {
     'search': {
 }
 class FileProcessor:
+    @staticmethod
+    def _get_asr_pipeline():
+        """Return the module-wide ASR pipeline, creating it on first use.
+
+        Returns the cached pipeline when one exists. Otherwise a new pipeline is
+        built for ASR_MODEL_NAME on CPU, provided both transformers and torch
+        were imported. Returns None when those libraries are missing or when
+        construction fails (the failure is logged).
+        """
+        global asr_pipeline_instance
+        # Fast path: already built by an earlier call.
+        if asr_pipeline_instance is not None:
+            return asr_pipeline_instance
+        # Without both libraries there is nothing to build; the cache stays None.
+        if not (hf_transformers_pipeline and torch):
+            return asr_pipeline_instance
+        device = -1  # CPU; the pipeline is always placed on CPU here
+        try:
+            asr_pipeline_instance = hf_transformers_pipeline(
+                "automatic-speech-recognition",
+                model=ASR_MODEL_NAME,
+                device=device
+            )
+            device_label = 'cuda' if device == 0 else 'cpu'
+            gaia_logger.info(f"ASR pipeline initialized: {ASR_MODEL_NAME} on {device_label}.")
+        except Exception as e:
+            gaia_logger.error(f"Failed to initialize ASR pipeline: {e}", exc_info=True)
+            return None
+        return asr_pipeline_instance
     @staticmethod
     def _get_filename_from_url(url_or_header: Optional[str]) -> str:
         if not url_or_header: return "unknown_file"
     @staticmethod
     def process(content: bytes, filename: Optional[str] = "unknown_file", content_type: Optional[str] = "") -> str:
         content_type_str = content_type.lower() if content_type else ""
+        filename_str = filename.lower() if filename else "unknown_file"
         try:
             if len(content) > MAX_FILE_SIZE:
                 gaia_logger.warning(f"File '{filename_str}' exceeds max size {MAX_FILE_SIZE} bytes.")
                 return f"Error: File '{filename_str}' exceeds maximum allowed size ({MAX_FILE_SIZE // (1024*1024)}MB)."
+            if 'csv' in content_type_str or filename_str.endswith('.csv'):
                 gaia_logger.info(f"Processing CSV file: {filename_str}")
                 return FileProcessor._process_csv(content, filename_str)
+            elif 'json' in content_type_str or filename_str.endswith('.json'):
                 gaia_logger.info(f"Processing JSON file: {filename_str}")
                 return FileProcessor._process_json(content, filename_str)
+            elif ('excel' in content_type_str or 'spreadsheetml' in content_type_str or \
+                  filename_str.endswith(('.xlsx', '.xls'))) and openpyxl: # Check for openpyxl
+                gaia_logger.info(f"Processing Excel file: {filename_str}")
+                return FileProcessor._process_excel(content, filename_str)
+            elif ('pdf' in content_type_str or filename_str.endswith('.pdf')) and pdfplumber: # Check for pdfplumber
+                gaia_logger.info(f"Processing PDF file: {filename_str}")
+                return FileProcessor._process_pdf(content, filename_str)
+            elif ('audio' in content_type_str or \
+                  filename_str.endswith(('.mp3', '.wav', '.flac', '.ogg', '.m4a'))) and \
+                  hf_transformers_pipeline and librosa: # Check for ASR libs
+                gaia_logger.info(f"Processing Audio file: {filename_str}")
+                return FileProcessor._process_audio(content, filename_str)
             elif 'text/plain' in content_type_str or \
                  ('text/' in content_type_str and not any(sub in content_type_str for sub in ['html', 'xml'])) or \
+                 filename_str.endswith(('.txt', '.md', '.py', '.js', '.c', '.cpp', '.java', '.html', '.xml', '.log')):
                 gaia_logger.info(f"Processing Text-like file: {filename_str} (Content-Type: {content_type_str})")
                 return FileProcessor._process_text(content, filename_str)
             else:
             gaia_logger.error(f"File processing error for '{filename_str}': {str(e)}", exc_info=True)
             return f"Error processing file '{filename_str}': An unexpected error occurred."
+    @staticmethod
+    def _truncate_text(text: str, filename: str, type_name: str) -> str:
+        """Clip *text* to the file-context budget, appending a truncation marker.
+
+        Text no longer than MAX_FILE_CONTEXT_LENGTH is returned unchanged.
+        Longer text is cut to MAX_FILE_CONTEXT_LENGTH - 25 characters, the
+        marker is appended, and the truncation is logged using *type_name*
+        and *filename*.
+        """
+        original_length = len(text)
+        if original_length <= MAX_FILE_CONTEXT_LENGTH:
+            return text
+        # Cut 25 characters short of the limit to leave room for the marker.
+        clipped = text[:MAX_FILE_CONTEXT_LENGTH - 25] + "\n... (content truncated)"
+        gaia_logger.info(f"{type_name} '{filename}' content truncated from {original_length} to {len(clipped)} chars.")
+        return clipped
     @staticmethod
     def _process_csv(content: bytes, filename: str) -> str:
         try:
             for enc in encodings_to_try:
                 try:
                     df = pd.read_csv(io.BytesIO(content), encoding=enc)
                     break
+                except UnicodeDecodeError: continue
+                except Exception: continue
+            if df is None: return f"Error: Could not decode CSV '{filename}'."
             num_rows, num_cols = len(df), len(df.columns)
             cols_str = ', '.join(df.columns)
             sample_str = df.head(CSV_SAMPLE_ROWS).to_markdown(index=False)
             summary = (
                 f"CSV Document Summary: '{filename}' ({num_rows} rows, {num_cols} columns):\n"
+                f"Columns: {cols_str}\nFirst {min(CSV_SAMPLE_ROWS, num_rows)} sample rows:\n{sample_str}"
             )
+            return FileProcessor._truncate_text(summary, filename, "CSV")
         except Exception as e:
+            return f"Error processing CSV '{filename}': {str(e)}"
     @staticmethod
     def _process_json(content: bytes, filename: str) -> str:
+        # Render JSON bytes as a pretty-printed text summary labelled with `filename`.
+        # Always returns a string: the summary on success, an error message otherwise.
         try:
+            # Invalid UTF-8 bytes are replaced rather than raising, so decoding cannot fail here.
             decoded_content = content.decode('utf-8', errors='replace')
             data = json.loads(decoded_content)
+            # Re-serialize with a 2-space indent so the structure is readable in the summary.
             pretty_json = json.dumps(data, indent=2)
+            summary = f"JSON Document: '{filename}':\n{pretty_json}"
+            return FileProcessor._truncate_text(summary, filename, "JSON")
+        except json.JSONDecodeError:
+            # Unparsable JSON: return a raw snippet (undecodable bytes dropped), capped
+            # 100 characters below MAX_FILE_CONTEXT_LENGTH.
             text_snippet = content.decode('utf-8', errors='ignore')[:MAX_FILE_CONTEXT_LENGTH - 100]
+            return f"Error: Invalid JSON in '{filename}'. Snippet:\n{text_snippet}..."
         except Exception as e:
+            # Any other failure is reported in the returned string instead of being raised.
+            return f"Error processing JSON '{filename}': {str(e)}"
     @staticmethod
     def _process_text(content: bytes, filename: str) -> str:
             for enc in encodings_to_try:
                 try:
                     text = content.decode(enc)
                     break
+                except UnicodeDecodeError: continue
+            if text is None: text = content.decode('utf-8', errors='ignore')
+            summary = f"Text Document: '{filename}':\n{text}"
+            return FileProcessor._truncate_text(summary, filename, "Text")
         except Exception as e:
             return f"Error processing text file '{filename}': {str(e)}"
+    @staticmethod
+    def _process_excel(content: bytes, filename: str) -> str:
+        """Summarize the sheets of an Excel workbook as text.
+
+        For each sheet the summary lists its row/column counts, its column
+        names and the first CSV_SAMPLE_ROWS rows as a markdown table. Sheets
+        are processed in workbook order until the accumulated summary exceeds
+        80% of MAX_FILE_CONTEXT_LENGTH; the result is then passed through
+        _truncate_text.
+
+        Args:
+            content: Raw bytes of the workbook.
+            filename: Name used in the summary and in log/error messages.
+
+        Returns:
+            The summary text, or an error message if openpyxl is unavailable
+            or the workbook cannot be read. Exceptions are logged, not raised.
+        """
+        if not openpyxl: return f"Error: Excel processing skipped for '{filename}', openpyxl library not available."
+        try:
+            summary_parts = [f"Excel Document Summary: '{filename}'"]
+            summary_length = len(summary_parts[0])
+            # The context manager closes the underlying workbook once all sheets are read.
+            with pd.ExcelFile(io.BytesIO(content), engine='openpyxl') as xls:
+                sheet_names = xls.sheet_names
+                for position, sheet_name in enumerate(sheet_names, start=1):
+                    df = xls.parse(sheet_name)
+                    num_rows, num_cols = len(df), len(df.columns)
+                    # Header cells can be numbers or dates; str.join only accepts strings.
+                    cols_str = ', '.join(str(col) for col in df.columns)
+                    sample_str = df.head(CSV_SAMPLE_ROWS).to_markdown(index=False)
+                    sheet_summary = (
+                        f"\n---\nSheet: '{sheet_name}' ({num_rows} rows, {num_cols} columns):\n"
+                        f"Columns: {cols_str}\nFirst {min(CSV_SAMPLE_ROWS, num_rows)} sample rows:\n{sample_str}"
+                    )
+                    summary_parts.append(sheet_summary)
+                    summary_length += len(sheet_summary)
+                    # Soft limit before the final truncate, to avoid parsing many sheets needlessly.
+                    if summary_length > MAX_FILE_CONTEXT_LENGTH * 0.8:
+                        # Only claim sheets were omitted when some actually remain.
+                        if position < len(sheet_names):
+                            summary_parts.append("\n... (further sheets omitted due to length)")
+                        break
+            full_summary = "".join(summary_parts)
+            return FileProcessor._truncate_text(full_summary, filename, "Excel")
+        except Exception as e:
+            gaia_logger.error(f"Excel processing error for '{filename}': {str(e)}", exc_info=True)
+            return f"Error processing Excel file '{filename}': {str(e)}"
+    @staticmethod
+    def _process_pdf(content: bytes, filename: str) -> str:
+        """Extract the text of a PDF, page by page, into a labelled summary.
+
+        Extraction stops once the collected text exceeds 1.2 times
+        MAX_FILE_CONTEXT_LENGTH; the summary is then passed through
+        _truncate_text. Returns an error message if pdfplumber is unavailable
+        or extraction raises, and a notice if no text was found.
+        """
+        if not pdfplumber: return f"Error: PDF processing skipped for '{filename}', pdfplumber library not available."
+        page_chunks = []
+        collected_chars = 0
+        try:
+            with io.BytesIO(content) as pdf_buffer, pdfplumber.open(pdf_buffer) as pdf:
+                for page_number, page in enumerate(pdf.pages, start=1):
+                    extracted = page.extract_text()
+                    if extracted:
+                        page_chunks.append(extracted + "\n")
+                        collected_chars += len(extracted) + 1
+                    # Allow slight overage before the hard truncate below.
+                    if collected_chars > MAX_FILE_CONTEXT_LENGTH * 1.2:
+                        gaia_logger.info(f"PDF '{filename}' text extraction stopped early due to length at page {page_number}.")
+                        break
+            text_content = "".join(page_chunks)
+            if not text_content:
+                return f"PDF Document: '{filename}'. No text could be extracted or PDF is empty."
+            summary = f"PDF Document: '{filename}':\n{text_content}"
+            return FileProcessor._truncate_text(summary, filename, "PDF")
+        except Exception as e:
+            gaia_logger.error(f"PDF processing error for '{filename}': {str(e)}", exc_info=True)
+            return f"Error processing PDF file '{filename}': {str(e)}"
+    @staticmethod
+    def _process_audio(content: bytes, filename: str) -> str:
+        """Transcribe audio bytes to text with the shared ASR pipeline.
+
+        The audio is decoded with librosa as 16 kHz mono, transcribed, and the
+        transcript is returned as a labelled summary passed through
+        _truncate_text.
+
+        Args:
+            content: Raw bytes of the audio file.
+            filename: Name used in the summary and in log/error messages.
+
+        Returns:
+            The transcript summary, a notice when nothing could be decoded or
+            transcribed, or an error message when a dependency is missing or
+            processing fails. Exceptions are logged, not raised.
+        """
+        # Check the cheap dependency first so a missing librosa never triggers a model load.
+        if not librosa:
+            return f"Error: Audio processing skipped for '{filename}', librosa library not available."
+        asr_pipeline = FileProcessor._get_asr_pipeline()
+        if not asr_pipeline:
+            return f"Error: Audio processing skipped for '{filename}', ASR pipeline not available."
+        try:
+            with io.BytesIO(content) as audio_buffer:
+                # Load audio, ensure 16kHz mono for Whisper
+                y, sr = librosa.load(audio_buffer, sr=16000, mono=True)
+            if len(y) == 0:
+                return f"Audio Document: '{filename}'. No audio samples could be decoded."
+            gaia_logger.info(f"Transcribing audio file: {filename} ({len(y)/sr:.2f} seconds)")
+            # perf_counter is monotonic, so the measured duration cannot be skewed by clock changes.
+            start_time = time.perf_counter()
+            # Whisper works on 30-second windows; chunking lets the pipeline cover longer
+            # recordings. NOTE(review): confirm against the installed transformers version.
+            transcription_result = asr_pipeline(y, chunk_length_s=30)  # Pass numpy array directly
+            elapsed = time.perf_counter() - start_time
+            gaia_logger.info(f"Audio transcription for '{filename}' took {elapsed:.2f} seconds.")
+            if isinstance(transcription_result, dict):
+                # Guard against a missing or None "text" entry.
+                transcribed_text = transcription_result.get("text") or ""
+            else:
+                transcribed_text = str(transcription_result)
+            if not transcribed_text.strip():
+                return f"Audio Document: '{filename}'. Transcription result was empty."
+            summary = f"Audio Document (Transcription): '{filename}':\n{transcribed_text}"
+            return FileProcessor._truncate_text(summary, filename, "Audio Transcription")
+        except Exception as e:
+            gaia_logger.error(f"Audio processing/transcription error for '{filename}': {str(e)}", exc_info=True)
+            return f"Error processing Audio file '{filename}': {str(e)}"
     @staticmethod
     def _handle_unknown_type(content: bytes, filename: str) -> str:
         gaia_logger.warning(f"Attempting to handle unknown file type for '{filename}' as text snippet.")

requirements.txt CHANGED Viewed

@@ -7,4 +7,9 @@ google-generativeai
 pandas
 beautifulsoup4
 lxml
-tavily-python

 pandas
 beautifulsoup4
 lxml
+tavily-python
+transformers
+torch
+librosa
+openpyxl
+pdfplumber