Commit
·
5134f75
1
Parent(s):
f36fcc8
Improve caching system: Enable document-level caching for better UX
Browse files- Added document-level caching that works across different prompts
- Users can now analyze the same document with multiple prompts efficiently
- Cached document content speeds up subsequent analyses with new prompts
- Enhanced user feedback to show when cached content is being used
- Maintains exact prompt+document caching for instant responses
- Better UX for users exploring the same document with different questions
- agents.py +14 -6
- app.py +13 -4
- utils/__init__.py +34 -1
agents.py
CHANGED
|
@@ -5,7 +5,7 @@ import logging
|
|
| 5 |
from typing import Optional, Dict, Any, List, AsyncGenerator
|
| 6 |
import time
|
| 7 |
|
| 8 |
-
from utils import call_openai_chat, load_pdf_text_cached, load_pdf_text_chunked, get_document_metadata, get_cached_analysis, cache_analysis
|
| 9 |
from utils.visual_output import VisualOutputGenerator
|
| 10 |
from config import Config
|
| 11 |
|
|
@@ -41,19 +41,27 @@ class AnalysisAgent(BaseAgent):
|
|
| 41 |
async def handle(self, user_id: str, prompt: str, file_path: Optional[str] = None, context: Optional[Dict[str, Any]] = None):
|
| 42 |
start_time = time.time()
|
| 43 |
|
| 44 |
-
# Check cache first
|
| 45 |
if file_path:
|
| 46 |
cached_result = get_cached_analysis(file_path, prompt)
|
| 47 |
if cached_result:
|
| 48 |
-
logger.info(f"Returning cached analysis for {file_path}")
|
| 49 |
return cached_result
|
| 50 |
|
| 51 |
if file_path:
|
| 52 |
# Get document metadata
|
| 53 |
metadata = get_document_metadata(file_path)
|
| 54 |
|
| 55 |
-
#
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
# Check if document needs chunking
|
| 59 |
if len(text) > Config.CHUNK_SIZE:
|
|
@@ -66,7 +74,7 @@ class AnalysisAgent(BaseAgent):
|
|
| 66 |
metadata = {}
|
| 67 |
result = await self._process_content(prompt, content, metadata, "")
|
| 68 |
|
| 69 |
-
# Cache the result
|
| 70 |
if file_path:
|
| 71 |
cache_analysis(file_path, prompt, result)
|
| 72 |
|
|
|
|
| 5 |
from typing import Optional, Dict, Any, List, AsyncGenerator
|
| 6 |
import time
|
| 7 |
|
| 8 |
+
from utils import call_openai_chat, load_pdf_text_cached, load_pdf_text_chunked, get_document_metadata, get_cached_analysis, cache_analysis, get_cached_document_content, cache_document_content
|
| 9 |
from utils.visual_output import VisualOutputGenerator
|
| 10 |
from config import Config
|
| 11 |
|
|
|
|
| 41 |
async def handle(self, user_id: str, prompt: str, file_path: Optional[str] = None, context: Optional[Dict[str, Any]] = None):
|
| 42 |
start_time = time.time()
|
| 43 |
|
| 44 |
+
# Check cache first - exact prompt match
|
| 45 |
if file_path:
|
| 46 |
cached_result = get_cached_analysis(file_path, prompt)
|
| 47 |
if cached_result:
|
| 48 |
+
logger.info(f"Returning cached analysis for {file_path} with exact prompt match")
|
| 49 |
return cached_result
|
| 50 |
|
| 51 |
if file_path:
|
| 52 |
# Get document metadata
|
| 53 |
metadata = get_document_metadata(file_path)
|
| 54 |
|
| 55 |
+
# Check for cached document content (any prompt)
|
| 56 |
+
cached_content = get_cached_document_content(file_path)
|
| 57 |
+
if cached_content:
|
| 58 |
+
logger.info(f"Using cached document content for {file_path}")
|
| 59 |
+
text = cached_content
|
| 60 |
+
else:
|
| 61 |
+
# Load and cache text
|
| 62 |
+
text = load_pdf_text_cached(file_path)
|
| 63 |
+
cache_document_content(file_path, text)
|
| 64 |
+
logger.info(f"Cached document content for {file_path}")
|
| 65 |
|
| 66 |
# Check if document needs chunking
|
| 67 |
if len(text) > Config.CHUNK_SIZE:
|
|
|
|
| 74 |
metadata = {}
|
| 75 |
result = await self._process_content(prompt, content, metadata, "")
|
| 76 |
|
| 77 |
+
# Cache the analysis result
|
| 78 |
if file_path:
|
| 79 |
cache_analysis(file_path, prompt, result)
|
| 80 |
|
app.py
CHANGED
|
@@ -90,15 +90,20 @@ def handle_analysis(file, prompt, username="anonymous", use_streaming=False):
|
|
| 90 |
path = save_uploaded_file(file, username)
|
| 91 |
|
| 92 |
# Check if this is a cached result
|
| 93 |
-
from utils import get_cached_analysis
|
| 94 |
cached_result = get_cached_analysis(path, prompt)
|
|
|
|
| 95 |
|
| 96 |
if cached_result:
|
| 97 |
-
status = "β‘ **Cached
|
| 98 |
result = cached_result.get("analysis", "No analysis result.")
|
| 99 |
metadata = cached_result.get("metadata", {})
|
| 100 |
else:
|
| 101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
result = run_async(
|
| 103 |
ORCHESTRATOR.handle_user_prompt,
|
| 104 |
user_id=username,
|
|
@@ -108,7 +113,11 @@ def handle_analysis(file, prompt, username="anonymous", use_streaming=False):
|
|
| 108 |
)
|
| 109 |
result = result.get("analysis", "No analysis result.")
|
| 110 |
metadata = result.get("metadata", {}) if isinstance(result, dict) else {}
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
return result, status, metadata
|
| 114 |
except Exception as e:
|
|
|
|
| 90 |
path = save_uploaded_file(file, username)
|
| 91 |
|
| 92 |
# Check if this is a cached result
|
| 93 |
+
from utils import get_cached_analysis, get_cached_document_content
|
| 94 |
cached_result = get_cached_analysis(path, prompt)
|
| 95 |
+
cached_content = get_cached_document_content(path)
|
| 96 |
|
| 97 |
if cached_result:
|
| 98 |
+
status = "β‘ **Cached Analysis** - Instant response from previous analysis"
|
| 99 |
result = cached_result.get("analysis", "No analysis result.")
|
| 100 |
metadata = cached_result.get("metadata", {})
|
| 101 |
else:
|
| 102 |
+
if cached_content:
|
| 103 |
+
status = "π **Processing** - Using cached document, analyzing with new prompt..."
|
| 104 |
+
else:
|
| 105 |
+
status = "π **Processing** - Analyzing document with AI..."
|
| 106 |
+
|
| 107 |
result = run_async(
|
| 108 |
ORCHESTRATOR.handle_user_prompt,
|
| 109 |
user_id=username,
|
|
|
|
| 113 |
)
|
| 114 |
result = result.get("analysis", "No analysis result.")
|
| 115 |
metadata = result.get("metadata", {}) if isinstance(result, dict) else {}
|
| 116 |
+
|
| 117 |
+
if cached_content:
|
| 118 |
+
status = "β
**Analysis Complete** - Fresh analysis using cached document"
|
| 119 |
+
else:
|
| 120 |
+
status = "β
**Analysis Complete** - Fresh analysis generated and cached"
|
| 121 |
|
| 122 |
return result, status, metadata
|
| 123 |
except Exception as e:
|
utils/__init__.py
CHANGED
|
@@ -232,7 +232,7 @@ CACHE_DIR = Path(tempfile.gettempdir()) / "pdf_analysis_cache"
|
|
| 232 |
CACHE_DIR.mkdir(exist_ok=True)
|
| 233 |
|
| 234 |
def get_cached_analysis(file_path: str, prompt: str) -> Optional[Dict[str, Any]]:
|
| 235 |
-
"""Retrieve cached analysis if available"""
|
| 236 |
file_hash = get_file_hash(file_path)
|
| 237 |
prompt_hash = hashlib.md5(prompt.encode()).hexdigest()
|
| 238 |
cache_file = CACHE_DIR / f"{file_hash}_{prompt_hash}.json"
|
|
@@ -250,6 +250,23 @@ def get_cached_analysis(file_path: str, prompt: str) -> Optional[Dict[str, Any]]
|
|
| 250 |
pass
|
| 251 |
return None
|
| 252 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
def cache_analysis(file_path: str, prompt: str, analysis: Dict[str, Any]) -> None:
|
| 254 |
"""Cache analysis results for future use"""
|
| 255 |
file_hash = get_file_hash(file_path)
|
|
@@ -268,6 +285,22 @@ def cache_analysis(file_path: str, prompt: str, analysis: Dict[str, Any]) -> Non
|
|
| 268 |
except Exception:
|
| 269 |
pass # Fail silently if caching fails
|
| 270 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
def get_cached_text(file_path: str) -> Optional[str]:
|
| 272 |
"""Retrieve cached PDF text if available"""
|
| 273 |
file_hash = get_file_hash(file_path)
|
|
|
|
| 232 |
CACHE_DIR.mkdir(exist_ok=True)
|
| 233 |
|
| 234 |
def get_cached_analysis(file_path: str, prompt: str) -> Optional[Dict[str, Any]]:
|
| 235 |
+
"""Retrieve cached analysis if available - exact prompt match"""
|
| 236 |
file_hash = get_file_hash(file_path)
|
| 237 |
prompt_hash = hashlib.md5(prompt.encode()).hexdigest()
|
| 238 |
cache_file = CACHE_DIR / f"{file_hash}_{prompt_hash}.json"
|
|
|
|
| 250 |
pass
|
| 251 |
return None
|
| 252 |
|
| 253 |
+
def get_cached_document_content(file_path: str) -> Optional[str]:
    """Return cached extracted text for *file_path*, or ``None`` on a miss.

    Document-level cache: keyed on the file alone (no prompt), so text
    extracted once can be reused when the same document is analyzed with
    a new prompt.

    An entry is honored only if the stored hash still matches the file's
    current hash (``get_file_hash`` — presumably content-based; verify
    against its definition) and the entry is younger than 24 hours.

    Returns:
        The cached document text, or ``None`` when no valid entry exists.
    """
    file_hash = get_file_hash(file_path)
    cache_file = CACHE_DIR / f"{file_hash}_content.json"

    if cache_file.exists():
        try:
            with open(cache_file, 'r', encoding='utf-8') as f:
                cache_data = json.load(f)
            # Valid only while the file is unchanged and the entry is
            # younger than 24 hours (86400 seconds).
            if (cache_data.get('file_hash') == file_hash and
                    time.time() - cache_data.get('cached_at', 0) < 86400):
                return cache_data.get('content')
        except (OSError, ValueError, TypeError):
            # Best-effort cache: treat unreadable or corrupt entries as a
            # miss. Narrowed from a blanket ``except Exception`` so real
            # programming errors surface. ValueError covers bad JSON
            # (json.JSONDecodeError subclasses it); TypeError covers a
            # malformed 'cached_at' field in the arithmetic comparison.
            pass
    return None
|
| 269 |
+
|
| 270 |
def cache_analysis(file_path: str, prompt: str, analysis: Dict[str, Any]) -> None:
|
| 271 |
"""Cache analysis results for future use"""
|
| 272 |
file_hash = get_file_hash(file_path)
|
|
|
|
| 285 |
except Exception:
|
| 286 |
pass # Fail silently if caching fails
|
| 287 |
|
| 288 |
+
def cache_document_content(file_path: str, content: str) -> None:
    """Persist extracted document text for reuse with any future prompt.

    Writes ``{file_hash}_content.json`` into ``CACHE_DIR`` containing the
    file hash, the text, and a timestamp that
    ``get_cached_document_content`` uses for 24-hour expiry.

    Caching is an optimization, never fatal: failures are swallowed and
    the caller proceeds without a cache entry.

    Args:
        file_path: Path of the source document (hashed for the cache key).
        content: The extracted document text to store.
    """
    file_hash = get_file_hash(file_path)
    cache_file = CACHE_DIR / f"{file_hash}_content.json"

    try:
        cache_data = {
            'file_hash': file_hash,
            'content': content,
            'cached_at': time.time(),
        }
        with open(cache_file, 'w', encoding='utf-8') as f:
            json.dump(cache_data, f, ensure_ascii=False)
    except (OSError, TypeError, ValueError):
        # Narrowed from a blanket ``except Exception`` so genuine bugs
        # surface; disk/permission problems (OSError) and
        # non-serializable content (TypeError/ValueError from json.dump)
        # are still ignored — cache writes are best-effort.
        pass
|
| 303 |
+
|
| 304 |
def get_cached_text(file_path: str) -> Optional[str]:
|
| 305 |
"""Retrieve cached PDF text if available"""
|
| 306 |
file_hash = get_file_hash(file_path)
|