Spaces:
Running
Running
glitz-dev committed on
Commit ·
8e51ed8
1
Parent(s): dc3da16
file processing based on url/filepath added
Browse files
- hipaathesis.py +155 -27
- requirements.txt +2 -0
hipaathesis.py
CHANGED
|
@@ -96,6 +96,8 @@ import getpass
|
|
| 96 |
import tempfile
|
| 97 |
import shutil
|
| 98 |
import numpy as np
|
|
|
|
|
|
|
| 99 |
from fastapi import FastAPI
|
| 100 |
from fastapi.staticfiles import StaticFiles
|
| 101 |
from pydantic import BaseModel
|
|
@@ -477,43 +479,169 @@ class HIPAACompliantThesisAnalyzer:
|
|
| 477 |
"""Calculate secure hash of document content"""
|
| 478 |
return hashlib.sha256(content.encode()).hexdigest()
|
| 479 |
|
| 480 |
-
def
|
| 481 |
-
"""
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
doc_hash = hashlib.sha256(doc_content).hexdigest()[:16]
|
| 488 |
|
| 489 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 490 |
|
| 491 |
try:
|
| 492 |
-
#
|
| 493 |
-
|
| 494 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
|
| 496 |
-
#
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
|
| 502 |
-
|
| 503 |
-
image_descriptions = []
|
| 504 |
-
if self.use_blip and images:
|
| 505 |
-
image_descriptions = self._analyze_images_securely(images)
|
| 506 |
-
self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "IMAGE_ANALYSIS")
|
| 507 |
|
| 508 |
-
#
|
| 509 |
-
|
| 510 |
-
|
|
|
|
| 511 |
|
| 512 |
-
|
|
|
|
| 513 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 514 |
except Exception as e:
|
| 515 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 516 |
raise e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 517 |
|
| 518 |
def process_document_securely(self, pdf_path, questions, output_file=None):
|
| 519 |
"""Process document with full HIPAA compliance"""
|
|
|
|
| 96 |
import tempfile
|
| 97 |
import shutil
|
| 98 |
import numpy as np
|
| 99 |
+
import requests
|
| 100 |
+
import urllib3
|
| 101 |
from fastapi import FastAPI
|
| 102 |
from fastapi.staticfiles import StaticFiles
|
| 103 |
from pydantic import BaseModel
|
|
|
|
| 479 |
"""Calculate secure hash of document content"""
|
| 480 |
return hashlib.sha256(content.encode()).hexdigest()
|
| 481 |
|
| 482 |
+
def _is_url(self, path):
    """Return True when *path* looks like a remote URL rather than a file path.

    The check is purely lexical: after stripping surrounding whitespace and
    lower-casing, the value must start with one of the recognised scheme
    prefixes (http, https, ftp, ftps). Nothing is fetched or validated.

    Args:
        path: Candidate location. Non-string values (``None``, ``bytes``,
            ``pathlib.Path`` objects, ...) are never treated as URLs.

    Returns:
        bool: True for a URL-looking string, False otherwise.
    """
    # Guard first so a Path object or None yields False instead of raising
    # AttributeError/TypeError on .strip()/.startswith().
    if not isinstance(path, str):
        return False
    # str.startswith accepts a tuple, so one call covers every scheme.
    url_prefixes = ('http://', 'https://', 'ftp://', 'ftps://')
    return path.strip().lower().startswith(url_prefixes)
|
| 486 |
+
|
| 487 |
+
def _extract_from_url(self, url, verify_ssl=None):
    """Download a PDF from *url* into a temporary file and extract its content.

    The response body is streamed to a temporary ``.pdf`` file, hashed while
    it is written, handed to ``self._extract_text_and_images`` and then the
    temporary file is always removed, whether or not an error occurred.

    Args:
        url: URL to download the PDF from. Surrounding whitespace is ignored.
        verify_ssl: Whether to verify SSL certificates. If None, verification
            is disabled for localhost URLs ('localhost', '127.0.0.1', '::1')
            and enabled for every other host.

    Returns:
        Tuple ``(text, images, doc_hash)``: ``text`` and ``images`` are
        whatever ``self._extract_text_and_images`` returns for the downloaded
        file, and ``doc_hash`` is the first 16 hex characters of the SHA-256
        digest of the downloaded bytes.

    Raises:
        Exception: If SSL verification or the download itself fails; the
            underlying ``requests`` exception is chained as ``__cause__``.
            Errors raised during extraction propagate unchanged.
    """
    import requests
    import urllib3
    from urllib.parse import urlparse

    # Same normalisation the URL detection applies, so " https://..." works.
    url = url.strip()
    # Set as soon as the temp file exists so the finally block can remove it.
    temp_pdf_path = None

    try:
        # Determine SSL verification setting
        hostname = urlparse(url).hostname or ''

        # Auto-disable SSL verification for localhost
        if verify_ssl is None:
            if hostname.lower() in ('localhost', '127.0.0.1', '::1'):
                verify_ssl = False
                print("Note: SSL verification disabled for localhost URL")
                # NOTE(review): disable_warnings() is process-wide, so
                # InsecureRequestWarning stays silenced for later requests
                # to any host, not just this localhost call.
                urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
            else:
                verify_ssl = True

        # Download the file from URL
        print(f"Downloading document from URL: {url}")
        digest = hashlib.sha256()

        # The context manager releases the streamed connection even when the
        # download or a later step fails.
        with requests.get(url, timeout=30, stream=True, verify=verify_ssl) as response:
            response.raise_for_status()

            # Check if content type is PDF (warning only; processing continues)
            content_type = response.headers.get('content-type', '').lower()
            if 'pdf' not in content_type and not url.lower().endswith('.pdf'):
                print(f"Warning: Content type is {content_type}, might not be a PDF")

            # delete=False because the extractor reopens the file by path
            # after this handle is closed; removal happens in `finally`.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
                temp_pdf_path = temp_pdf.name
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        temp_pdf.write(chunk)
                        # Hash while streaming instead of re-reading the
                        # whole file into memory afterwards.
                        digest.update(chunk)

        print(f"Downloaded successfully to temporary file: {temp_pdf_path}")

        # Truncated digest used as the document identifier in the audit trail
        doc_hash = digest.hexdigest()[:16]

        # Extract text and images from the downloaded file
        text, images = self._extract_text_and_images(temp_pdf_path)

        return text, images, doc_hash

    except requests.exceptions.SSLError as e:
        # Provide helpful error message for SSL errors
        error_msg = f"SSL certificate verification failed: {e}\n"
        error_msg += "For self-signed certificates, the verification is automatically disabled for localhost.\n"
        error_msg += "If you're using a self-signed certificate on a non-localhost domain, "
        error_msg += "consider using a trusted certificate or contact your administrator."
        raise Exception(error_msg) from e
    except requests.exceptions.RequestException as e:
        raise Exception(f"Failed to download from URL: {e}") from e
    finally:
        # Runs on success and on every failure path (including request errors
        # raised mid-download), so the downloaded document never lingers on disk.
        if temp_pdf_path is not None:
            try:
                os.unlink(temp_pdf_path)
                print("Temporary file cleaned up")
            except OSError as e:
                print(f"Warning: Could not delete temporary file: {e}")
|
| 570 |
+
|
| 571 |
+
def _prepare_document(self, pdf_path):
    """Prepare a document for processing: extract text/images, then OCR and image analysis.

    Supports both local file paths and URLs; ``self._is_url`` decides which
    path is taken. Both sources share the same OCR / image-analysis /
    text-combination steps and differ only in how the document is loaded
    and in the audit actions that are logged.

    Args:
        pdf_path: Local file path or URL of the PDF to prepare.

    Returns:
        Tuple ``(combined_text, images, ocr_results, doc_hash)`` where
        ``combined_text`` is the extracted text, a single space, and the
        space-joined non-empty ``'ocr_text'`` values from ``ocr_results``;
        ``doc_hash`` is the first 16 hex characters of the document's SHA-256.

    Raises:
        Exception: Any error from extraction, OCR or image analysis is
            re-raised unchanged after a failed-access audit entry is logged.
            Errors opening a local file for hashing propagate without that
            audit entry.
    """
    self.check_session_timeout()

    # Dynamically identify if input is URL or file path
    from_url = self._is_url(pdf_path)

    if from_url:
        print(f"Detected URL input: {pdf_path}")
        # The hash is unknown until the download finishes, so the literal
        # "URL" stands in as the document identifier for this entry.
        self.hipaa_logger.log_phi_processing(self.user_id, "URL", "URL_DOWNLOAD_START")
        doc_hash = None
        error_action = "URL_PREPARATION_ERROR"
    else:
        print(f"Detected file path input: {pdf_path}")

        # Calculate document hash for audit trail, reading in chunks so a
        # large PDF is never held in memory just to be hashed.
        digest = hashlib.sha256()
        with open(pdf_path, 'rb') as f:
            for block in iter(lambda: f.read(65536), b''):
                digest.update(block)
        doc_hash = digest.hexdigest()[:16]

        self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "DOCUMENT_LOAD")
        error_action = "PREPARATION_ERROR"

    try:
        # Extract text and images from the appropriate source
        if from_url:
            text, images, doc_hash = self._extract_from_url(pdf_path)
            self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "URL_EXTRACTION")
        else:
            text, images = self._extract_text_and_images(pdf_path)
            self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "TEXT_EXTRACTION")

        # Perform OCR if enabled
        ocr_results = []
        if self.use_ocr and images:
            ocr_results = self._perform_secure_ocr(images)
            self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "OCR_PROCESSING")

        # Analyze images if BLIP enabled
        if self.use_blip and images:
            # NOTE(review): the descriptions are not part of the return
            # value; the call is kept so the IMAGE_ANALYSIS audit entry and
            # any side effects of the analysis are unchanged.
            self._analyze_images_securely(images)
            self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "IMAGE_ANALYSIS")

        # Combine all text
        ocr_text = " ".join(
            result['ocr_text'] for result in ocr_results if result.get('ocr_text')
        )
        combined_text = text + " " + ocr_text

        return combined_text, images, ocr_results, doc_hash

    except Exception:
        self.hipaa_logger.log_access(self.user_id, error_action, pdf_path, success=False)
        # Bare raise keeps the original traceback intact.
        raise
|
| 645 |
|
| 646 |
def process_document_securely(self, pdf_path, questions, output_file=None):
|
| 647 |
"""Process document with full HIPAA compliance"""
|
requirements.txt
CHANGED
|
@@ -8,6 +8,8 @@ Pillow==11.3.0
|
|
| 8 |
pydantic==2.11.9
|
| 9 |
PyPDF2==3.0.1
|
| 10 |
pytesseract==0.3.13
|
|
|
|
| 11 |
torch==2.8.0
|
| 12 |
transformers==4.56.1
|
|
|
|
| 13 |
uvicorn
|
|
|
|
| 8 |
pydantic==2.11.9
|
| 9 |
PyPDF2==3.0.1
|
| 10 |
pytesseract==0.3.13
|
| 11 |
+
requests==2.31.0
|
| 12 |
torch==2.8.0
|
| 13 |
transformers==4.56.1
|
| 14 |
+
urllib3==2.2.0
|
| 15 |
uvicorn
|