glitz-dev committed on
Commit
8e51ed8
·
1 Parent(s): dc3da16

Added file processing based on URL or file path

Browse files
Files changed (2) hide show
  1. hipaathesis.py +155 -27
  2. requirements.txt +2 -0
hipaathesis.py CHANGED
@@ -96,6 +96,8 @@ import getpass
96
  import tempfile
97
  import shutil
98
  import numpy as np
 
 
99
  from fastapi import FastAPI
100
  from fastapi.staticfiles import StaticFiles
101
  from pydantic import BaseModel
@@ -477,43 +479,169 @@ class HIPAACompliantThesisAnalyzer:
477
  """Calculate secure hash of document content"""
478
  return hashlib.sha256(content.encode()).hexdigest()
479
 
480
- def _prepare_document(self, pdf_path):
481
- """Common method to prepare document for processing (extract text/images/OCR)"""
482
- self.check_session_timeout()
483
-
484
- # Calculate document hash for audit trail
485
- with open(pdf_path, 'rb') as f:
486
- doc_content = f.read()
487
- doc_hash = hashlib.sha256(doc_content).hexdigest()[:16]
488
 
489
- self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "DOCUMENT_LOAD")
 
 
 
 
 
 
 
490
 
491
  try:
492
- # Extract text and images
493
- text, images = self._extract_text_and_images(pdf_path)
494
- self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "TEXT_EXTRACTION")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
495
 
496
- # Perform OCR if enabled
497
- ocr_results = []
498
- if self.use_ocr and images:
499
- ocr_results = self._perform_secure_ocr(images)
500
- self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "OCR_PROCESSING")
501
 
502
- # Analyze images if BLIP enabled
503
- image_descriptions = []
504
- if self.use_blip and images:
505
- image_descriptions = self._analyze_images_securely(images)
506
- self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "IMAGE_ANALYSIS")
507
 
508
- # Combine all text
509
- ocr_text = " ".join([result['ocr_text'] for result in ocr_results if result.get('ocr_text')])
510
- combined_text = text + " " + ocr_text
 
511
 
512
- return combined_text, images, ocr_results, doc_hash
 
513
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
514
  except Exception as e:
515
- self.hipaa_logger.log_access(self.user_id, "PREPARATION_ERROR", pdf_path, success=False)
 
 
 
 
 
516
  raise e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
517
 
518
  def process_document_securely(self, pdf_path, questions, output_file=None):
519
  """Process document with full HIPAA compliance"""
 
96
  import tempfile
97
  import shutil
98
  import numpy as np
99
+ import requests
100
+ import urllib3
101
  from fastapi import FastAPI
102
  from fastapi.staticfiles import StaticFiles
103
  from pydantic import BaseModel
 
479
  """Calculate secure hash of document content"""
480
  return hashlib.sha256(content.encode()).hexdigest()
481
 
482
+ def _is_url(self, path):
483
+ """Check if the provided path is a URL"""
484
+ url_patterns = ['http://', 'https://', 'ftp://', 'ftps://']
485
+ return any(path.strip().lower().startswith(pattern) for pattern in url_patterns)
486
+
487
+ def _extract_from_url(self, url, verify_ssl=None):
488
+ """Extract content from URL - download PDF temporarily and process
 
489
 
490
+ Args:
491
+ url: URL to download PDF from
492
+ verify_ssl: Whether to verify SSL certificates. If None, automatically
493
+ disables verification for localhost URLs
494
+ """
495
+ import requests
496
+ import urllib3
497
+ from urllib.parse import urlparse
498
 
499
  try:
500
+ # Determine SSL verification setting
501
+ parsed_url = urlparse(url)
502
+ hostname = parsed_url.hostname or ''
503
+
504
+ # Auto-disable SSL verification for localhost
505
+ if verify_ssl is None:
506
+ if hostname.lower() in ['localhost', '127.0.0.1', '::1']:
507
+ verify_ssl = False
508
+ print(f"Note: SSL verification disabled for localhost URL")
509
+ # Suppress only the InsecureRequestWarning for localhost
510
+ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
511
+ else:
512
+ verify_ssl = True
513
+
514
+ # Download the file from URL
515
+ print(f"Downloading document from URL: {url}")
516
+ response = requests.get(url, timeout=30, stream=True, verify=verify_ssl)
517
+ response.raise_for_status()
518
+
519
+ # Check if content type is PDF
520
+ content_type = response.headers.get('content-type', '').lower()
521
+ if 'pdf' not in content_type and not url.lower().endswith('.pdf'):
522
+ print(f"Warning: Content type is {content_type}, might not be a PDF")
523
+
524
+ # Create a temporary file to store the downloaded PDF
525
+ temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
526
+ temp_pdf_path = temp_pdf.name
527
 
528
+ # Write content to temporary file
529
+ for chunk in response.iter_content(chunk_size=8192):
530
+ if chunk:
531
+ temp_pdf.write(chunk)
532
+ temp_pdf.close()
533
 
534
+ print(f"Downloaded successfully to temporary file: {temp_pdf_path}")
 
 
 
 
535
 
536
+ # Calculate document hash for audit trail
537
+ with open(temp_pdf_path, 'rb') as f:
538
+ doc_content = f.read()
539
+ doc_hash = hashlib.sha256(doc_content).hexdigest()[:16]
540
 
541
+ # Extract text and images from the downloaded file
542
+ text, images = self._extract_text_and_images(temp_pdf_path)
543
 
544
+ # Clean up temporary file after extraction
545
+ try:
546
+ os.unlink(temp_pdf_path)
547
+ print("Temporary file cleaned up")
548
+ except Exception as e:
549
+ print(f"Warning: Could not delete temporary file: {e}")
550
+
551
+ return text, images, doc_hash
552
+
553
+ except requests.exceptions.SSLError as e:
554
+ # Provide helpful error message for SSL errors
555
+ error_msg = f"SSL certificate verification failed: {e}\n"
556
+ error_msg += "For self-signed certificates, the verification is automatically disabled for localhost.\n"
557
+ error_msg += "If you're using a self-signed certificate on a non-localhost domain, "
558
+ error_msg += "consider using a trusted certificate or contact your administrator."
559
+ raise Exception(error_msg)
560
+ except requests.exceptions.RequestException as e:
561
+ raise Exception(f"Failed to download from URL: {e}")
562
  except Exception as e:
563
+ # Clean up temp file if it exists
564
+ if 'temp_pdf_path' in locals() and os.path.exists(temp_pdf_path):
565
+ try:
566
+ os.unlink(temp_pdf_path)
567
+ except:
568
+ pass
569
  raise e
570
+
571
+ def _prepare_document(self, pdf_path):
572
+ """Common method to prepare document for processing (extract text/images/OCR)
573
+ Supports both file paths and URLs"""
574
+ self.check_session_timeout()
575
+
576
+ # Dynamically identify if input is URL or file path
577
+ if self._is_url(pdf_path):
578
+ # URL processing
579
+ print(f"Detected URL input: {pdf_path}")
580
+ self.hipaa_logger.log_phi_processing(self.user_id, "URL", "URL_DOWNLOAD_START")
581
+
582
+ try:
583
+ # Extract from URL
584
+ text, images, doc_hash = self._extract_from_url(pdf_path)
585
+ self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "URL_EXTRACTION")
586
+
587
+ # Perform OCR if enabled
588
+ ocr_results = []
589
+ if self.use_ocr and images:
590
+ ocr_results = self._perform_secure_ocr(images)
591
+ self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "OCR_PROCESSING")
592
+
593
+ # Analyze images if BLIP enabled
594
+ image_descriptions = []
595
+ if self.use_blip and images:
596
+ image_descriptions = self._analyze_images_securely(images)
597
+ self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "IMAGE_ANALYSIS")
598
+
599
+ # Combine all text
600
+ ocr_text = " ".join([result['ocr_text'] for result in ocr_results if result.get('ocr_text')])
601
+ combined_text = text + " " + ocr_text
602
+
603
+ return combined_text, images, ocr_results, doc_hash
604
+
605
+ except Exception as e:
606
+ self.hipaa_logger.log_access(self.user_id, "URL_PREPARATION_ERROR", pdf_path, success=False)
607
+ raise e
608
+ else:
609
+ # File path processing (existing logic)
610
+ print(f"Detected file path input: {pdf_path}")
611
+
612
+ # Calculate document hash for audit trail
613
+ with open(pdf_path, 'rb') as f:
614
+ doc_content = f.read()
615
+ doc_hash = hashlib.sha256(doc_content).hexdigest()[:16]
616
+
617
+ self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "DOCUMENT_LOAD")
618
+
619
+ try:
620
+ # Extract text and images
621
+ text, images = self._extract_text_and_images(pdf_path)
622
+ self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "TEXT_EXTRACTION")
623
+
624
+ # Perform OCR if enabled
625
+ ocr_results = []
626
+ if self.use_ocr and images:
627
+ ocr_results = self._perform_secure_ocr(images)
628
+ self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "OCR_PROCESSING")
629
+
630
+ # Analyze images if BLIP enabled
631
+ image_descriptions = []
632
+ if self.use_blip and images:
633
+ image_descriptions = self._analyze_images_securely(images)
634
+ self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "IMAGE_ANALYSIS")
635
+
636
+ # Combine all text
637
+ ocr_text = " ".join([result['ocr_text'] for result in ocr_results if result.get('ocr_text')])
638
+ combined_text = text + " " + ocr_text
639
+
640
+ return combined_text, images, ocr_results, doc_hash
641
+
642
+ except Exception as e:
643
+ self.hipaa_logger.log_access(self.user_id, "PREPARATION_ERROR", pdf_path, success=False)
644
+ raise e
645
 
646
  def process_document_securely(self, pdf_path, questions, output_file=None):
647
  """Process document with full HIPAA compliance"""
requirements.txt CHANGED
@@ -8,6 +8,8 @@ Pillow==11.3.0
8
  pydantic==2.11.9
9
  PyPDF2==3.0.1
10
  pytesseract==0.3.13
 
11
  torch==2.8.0
12
  transformers==4.56.1
 
13
  uvicorn
 
8
  pydantic==2.11.9
9
  PyPDF2==3.0.1
10
  pytesseract==0.3.13
11
+ requests==2.31.0
12
  torch==2.8.0
13
  transformers==4.56.1
14
+ urllib3==2.2.0
15
  uvicorn