Spaces:

Manu-glitz
/

Extraction

Sleeping

App Files Files Community

glitz-dev committed on Dec 23, 2025

Commit

8e51ed8

1 Parent(s): dc3da16

Added document processing from either a URL or a local file path

Browse files

Files changed (2) hide show

hipaathesis.py +155 -27
requirements.txt +2 -0

hipaathesis.py CHANGED Viewed

@@ -96,6 +96,8 @@ import getpass
 import tempfile
 import shutil
 import numpy as np
 from fastapi import FastAPI
 from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel
@@ -477,43 +479,169 @@ class HIPAACompliantThesisAnalyzer:
         """Calculate secure hash of document content"""
         return hashlib.sha256(content.encode()).hexdigest()
-    def _prepare_document(self, pdf_path):
-        """Common method to prepare document for processing (extract text/images/OCR)"""
-        self.check_session_timeout()
-        # Calculate document hash for audit trail
-        with open(pdf_path, 'rb') as f:
-            doc_content = f.read()
-            doc_hash = hashlib.sha256(doc_content).hexdigest()[:16]
-        self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "DOCUMENT_LOAD")
         try:
-            # Extract text and images
-            text, images = self._extract_text_and_images(pdf_path)
-            self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "TEXT_EXTRACTION")
-            # Perform OCR if enabled
-            ocr_results = []
-            if self.use_ocr and images:
-                ocr_results = self._perform_secure_ocr(images)
-                self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "OCR_PROCESSING")
-            # Analyze images if BLIP enabled
-            image_descriptions = []
-            if self.use_blip and images:
-                image_descriptions = self._analyze_images_securely(images)
-                self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "IMAGE_ANALYSIS")
-            # Combine all text
-            ocr_text = " ".join([result['ocr_text'] for result in ocr_results if result.get('ocr_text')])
-            combined_text = text + " " + ocr_text
-            return combined_text, images, ocr_results, doc_hash
         except Exception as e:
-            self.hipaa_logger.log_access(self.user_id, "PREPARATION_ERROR", pdf_path, success=False)
             raise e
     def process_document_securely(self, pdf_path, questions, output_file=None):
         """Process document with full HIPAA compliance"""

 import tempfile
 import shutil
 import numpy as np
+import requests
+import urllib3
 from fastapi import FastAPI
 from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel
         """Calculate secure hash of document content"""
         return hashlib.sha256(content.encode()).hexdigest()
+    def _is_url(self, path):
+        """Return True when ``path`` looks like a remote URL rather than a file path.
+        Args:
+            path: The document location; normally a string, but any object
+                  is tolerated.
+        Returns:
+            True if the stripped, lower-cased string starts with http://,
+            https://, ftp:// or ftps://; False otherwise, including for any
+            non-string input.
+        """
+        # The file-path branch hands the same value to open(), which accepts
+        # os.PathLike objects. Those have no .strip(), so treat anything that
+        # is not a plain string as "not a URL" instead of raising.
+        if not isinstance(path, str):
+            return False
+        # NOTE(review): ftp:// and ftps:// are recognised here, but the download
+        # goes through requests.get - confirm an adapter for them is mounted.
+        url_prefixes = ('http://', 'https://', 'ftp://', 'ftps://')
+        # str.startswith accepts a tuple, so one call covers every scheme.
+        return path.strip().lower().startswith(url_prefixes)
+    def _extract_from_url(self, url, verify_ssl=None):
+        """Download a PDF from ``url`` into a temporary file and extract its content.
+        Args:
+            url: URL to download the PDF from.
+            verify_ssl: Whether to verify SSL certificates. If None, verification
+                        is disabled for localhost URLs ('localhost', '127.0.0.1',
+                        '::1') and enabled for every other host.
+        Returns:
+            A ``(text, images, doc_hash)`` tuple, where ``text`` and ``images``
+            come from ``self._extract_text_and_images`` and ``doc_hash`` is the
+            first 16 hex characters of the SHA-256 of the downloaded bytes.
+        Raises:
+            Exception: with an explanatory message when SSL verification fails
+                or when the download fails for any other requests-level reason.
+                The original requests exception is attached as ``__cause__``.
+            Any exception raised by ``self._extract_text_and_images`` propagates
+            unchanged.
+        """
+        import requests
+        import urllib3
+        from urllib.parse import urlparse
+        # Determine SSL verification setting
+        if verify_ssl is None:
+            hostname = (urlparse(url).hostname or '').lower()
+            if hostname in ('localhost', '127.0.0.1', '::1'):
+                verify_ssl = False
+                print("Note: SSL verification disabled for localhost URL")
+                # NOTE: disable_warnings installs a process-wide warnings filter,
+                # so InsecureRequestWarning stays silenced after this call too.
+                urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+            else:
+                verify_ssl = True
+        # Stays None until the temp file exists, so cleanup knows whether there
+        # is anything to delete.
+        temp_pdf_path = None
+        try:
+            print(f"Downloading document from URL: {url}")
+            # Context managers guarantee the connection and the file handle are
+            # released even if the stream breaks part-way through.
+            with requests.get(url, timeout=30, stream=True, verify=verify_ssl) as response:
+                response.raise_for_status()
+                # Check if content type is PDF (warning only; processing continues)
+                content_type = response.headers.get('content-type', '').lower()
+                if 'pdf' not in content_type and not url.lower().endswith('.pdf'):
+                    print(f"Warning: Content type is {content_type}, might not be a PDF")
+                # Hash while streaming so the file is not read a second time;
+                # the digest equals the SHA-256 of the bytes written to disk.
+                hasher = hashlib.sha256()
+                with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
+                    temp_pdf_path = temp_pdf.name
+                    for chunk in response.iter_content(chunk_size=8192):
+                        if chunk:
+                            temp_pdf.write(chunk)
+                            hasher.update(chunk)
+            print(f"Downloaded successfully to temporary file: {temp_pdf_path}")
+            # Truncated document hash for the audit trail
+            doc_hash = hasher.hexdigest()[:16]
+            # Extract text and images from the downloaded file
+            text, images = self._extract_text_and_images(temp_pdf_path)
+            return text, images, doc_hash
+        except requests.exceptions.SSLError as e:
+            # Provide helpful error message for SSL errors
+            error_msg = f"SSL certificate verification failed: {e}\n"
+            error_msg += "For self-signed certificates, the verification is automatically disabled for localhost.\n"
+            error_msg += "If you're using a self-signed certificate on a non-localhost domain, "
+            error_msg += "consider using a trusted certificate or contact your administrator."
+            raise Exception(error_msg) from e
+        except requests.exceptions.RequestException as e:
+            raise Exception(f"Failed to download from URL: {e}") from e
+        finally:
+            # Always remove the downloaded copy - on success and on every
+            # failure path - so no document content is left in the temp dir.
+            if temp_pdf_path is not None:
+                try:
+                    os.unlink(temp_pdf_path)
+                    print("Temporary file cleaned up")
+                except FileNotFoundError:
+                    # Already gone; nothing left to clean up.
+                    pass
+                except OSError as e:
+                    print(f"Warning: Could not delete temporary file: {e}")
+    def _prepare_document(self, pdf_path):
+        """Prepare a document for processing (extract text/images, OCR, image analysis).
+        Supports both local file paths and URLs; the kind of input is detected
+        with ``self._is_url``.
+        Args:
+            pdf_path: Local path of the PDF, or a URL to download it from.
+        Returns:
+            A ``(combined_text, images, ocr_results, doc_hash)`` tuple, where
+            ``combined_text`` is the extracted text followed by a space and the
+            space-joined OCR text, and ``doc_hash`` is the truncated SHA-256
+            used in the audit log entries.
+        Raises:
+            Any exception from extraction, OCR or image analysis is logged as a
+            failed access and re-raised unchanged. Errors opening a local file
+            for hashing propagate without that log entry.
+        """
+        self.check_session_timeout()
+        # Dynamically identify if input is URL or file path
+        from_url = self._is_url(pdf_path)
+        error_event = "URL_PREPARATION_ERROR" if from_url else "PREPARATION_ERROR"
+        if from_url:
+            print(f"Detected URL input: {pdf_path}")
+            # No content hash exists before the download, hence the "URL" placeholder.
+            self.hipaa_logger.log_phi_processing(self.user_id, "URL", "URL_DOWNLOAD_START")
+        else:
+            print(f"Detected file path input: {pdf_path}")
+            # Calculate document hash for audit trail
+            with open(pdf_path, 'rb') as f:
+                doc_hash = hashlib.sha256(f.read()).hexdigest()[:16]
+            self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "DOCUMENT_LOAD")
+        try:
+            if from_url:
+                # _is_url matched on the stripped string, so download that same
+                # stripped string rather than the raw input.
+                text, images, doc_hash = self._extract_from_url(pdf_path.strip())
+                self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "URL_EXTRACTION")
+            else:
+                text, images = self._extract_text_and_images(pdf_path)
+                self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "TEXT_EXTRACTION")
+            # Perform OCR if enabled
+            ocr_results = []
+            if self.use_ocr and images:
+                ocr_results = self._perform_secure_ocr(images)
+                self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "OCR_PROCESSING")
+            # Analyze images if BLIP enabled. The descriptions are not part of
+            # this method's return value; the call and its audit entry are kept
+            # so the processing steps stay the same.
+            if self.use_blip and images:
+                self._analyze_images_securely(images)
+                self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "IMAGE_ANALYSIS")
+            # Combine all text, skipping OCR entries with no text
+            ocr_text = " ".join(result['ocr_text'] for result in ocr_results if result.get('ocr_text'))
+            combined_text = text + " " + ocr_text
+            return combined_text, images, ocr_results, doc_hash
+        except Exception:
+            self.hipaa_logger.log_access(self.user_id, error_event, pdf_path, success=False)
+            # Bare raise keeps the original traceback intact.
+            raise
     def process_document_securely(self, pdf_path, questions, output_file=None):
         """Process document with full HIPAA compliance"""

requirements.txt CHANGED Viewed

@@ -8,6 +8,8 @@ Pillow==11.3.0
 pydantic==2.11.9
 PyPDF2==3.0.1
 pytesseract==0.3.13
 torch==2.8.0
 transformers==4.56.1
 uvicorn

 pydantic==2.11.9
 PyPDF2==3.0.1
 pytesseract==0.3.13
+requests==2.31.0
 torch==2.8.0
 transformers==4.56.1
+urllib3==2.2.0
 uvicorn