Spaces:
Sleeping
Sleeping
glitz-dev
committed on
Commit
·
1fd1e18
1
Parent(s):
8e51ed8
extract content & save data from app itself or return the response added
Browse files- hipaathesis.py +463 -1
- requirements.txt +6 -0
hipaathesis.py
CHANGED
|
@@ -98,8 +98,14 @@ import shutil
|
|
| 98 |
import numpy as np
|
| 99 |
import requests
|
| 100 |
import urllib3
|
| 101 |
-
from fastapi import FastAPI
|
| 102 |
from fastapi.staticfiles import StaticFiles
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
from pydantic import BaseModel
|
| 104 |
from typing import List, Dict, Any, Optional
|
| 105 |
|
|
@@ -741,6 +747,39 @@ class HIPAACompliantThesisAnalyzer:
|
|
| 741 |
except Exception as e:
|
| 742 |
self.hipaa_logger.log_access(self.user_id, "SUMMARY_ERROR", pdf_path, success=False)
|
| 743 |
raise e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 744 |
|
| 745 |
def process_questions_only(self, pdf_path, questions, output_file=None):
|
| 746 |
"""Process document for Q&A only"""
|
|
@@ -769,6 +808,33 @@ class HIPAACompliantThesisAnalyzer:
|
|
| 769 |
except Exception as e:
|
| 770 |
self.hipaa_logger.log_access(self.user_id, "QA_ERROR", pdf_path, success=False)
|
| 771 |
raise e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 772 |
|
| 773 |
def process_annotations_only(self, pdf_path, output_file=None):
|
| 774 |
"""Process document for PubTator annotations only"""
|
|
@@ -803,6 +869,68 @@ class HIPAACompliantThesisAnalyzer:
|
|
| 803 |
except Exception as e:
|
| 804 |
self.hipaa_logger.log_access(self.user_id, "ANNOTATION_ERROR", pdf_path, success=False)
|
| 805 |
raise e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 806 |
|
| 807 |
def _extract_text_and_images(self, pdf_path):
|
| 808 |
"""Securely extract text and images from PDF"""
|
|
@@ -1163,6 +1291,340 @@ def get_annotations(req: AnalyzeReq):
|
|
| 1163 |
except Exception as e:
|
| 1164 |
print(f"Error in get_annotations: {e}")
|
| 1165 |
return {"error": str(e)}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1166 |
|
| 1167 |
@app.post('/analyze')
|
| 1168 |
def analyze(req: AnalyzeReq):
|
|
|
|
| 98 |
import numpy as np
|
| 99 |
import requests
|
| 100 |
import urllib3
|
| 101 |
+
from fastapi import FastAPI, UploadFile, File, Form
|
| 102 |
from fastapi.staticfiles import StaticFiles
|
| 103 |
+
try:
|
| 104 |
+
import psycopg2
|
| 105 |
+
PSYCOPG2_AVAILABLE = True
|
| 106 |
+
except ImportError:
|
| 107 |
+
PSYCOPG2_AVAILABLE = False
|
| 108 |
+
print("Warning: psycopg2 not available. Database features will be disabled.")
|
| 109 |
from pydantic import BaseModel
|
| 110 |
from typing import List, Dict, Any, Optional
|
| 111 |
|
|
|
|
| 747 |
except Exception as e:
|
| 748 |
self.hipaa_logger.log_access(self.user_id, "SUMMARY_ERROR", pdf_path, success=False)
|
| 749 |
raise e
|
| 750 |
+
def process_summary_only_from_text(self, text_content, output_file=None):
    """Summarize already-extracted text (no PDF extraction step).

    Mirrors process_summary_only, but accepts raw text — e.g. content
    previously stored in the database — instead of a PDF path.

    Args:
        text_content: Document text to analyze.
        output_file: Optional path; when given, the report is persisted
            via the secure handler.

    Returns:
        Report dict with HIPAA-compliance metadata and the text analysis.

    Raises:
        Exception: re-raised after logging a SUMMARY_ERROR access event.
    """
    try:
        # Evaluation order matches the original: summary, key terms, sections.
        analysis = {
            "summary": self._generate_summary_secure(text_content),
            "key_terms": self._extract_key_terms(text_content)[:15],
            "sections_found": list(self._extract_key_sections(text_content).keys()),
        }

        digest = self.calculate_document_hash(text_content)
        self.hipaa_logger.log_phi_processing(self.user_id, digest, "SUMMARY_COMPLETE")

        report = {
            "hipaa_compliance": {
                "processed_locally": True,
                "user_id": self.user_id,
                "document_hash": digest,
                "processing_timestamp": datetime.now().isoformat(),
            },
            "text_analysis": analysis,
        }

        if output_file:
            self.secure_handler.secure_save(report, output_file)

        return report
    except Exception as exc:
        # "DB_CONTENT" stands in for a file path: there is no source file here.
        self.hipaa_logger.log_access(self.user_id, "SUMMARY_ERROR", "DB_CONTENT", success=False)
        print(f"Error in process_summary_only_from_text: {exc}")
        raise exc
|
| 783 |
|
| 784 |
def process_questions_only(self, pdf_path, questions, output_file=None):
|
| 785 |
"""Process document for Q&A only"""
|
|
|
|
| 808 |
except Exception as e:
|
| 809 |
self.hipaa_logger.log_access(self.user_id, "QA_ERROR", pdf_path, success=False)
|
| 810 |
raise e
|
| 811 |
+
def process_questions_only_from_text(self, text_content, questions, output_file=None):
    """Answer questions against already-extracted text (no PDF extraction step).

    Companion to process_questions_only for content that arrives as raw
    text (e.g. fetched from the database).

    Args:
        text_content: Document text to query.
        questions: Questions to answer against the text.
        output_file: Optional path; when given, the report is persisted
            via the secure handler.

    Returns:
        Report dict with HIPAA-compliance metadata and the Q&A results.

    Raises:
        Exception: re-raised after logging a QA_ERROR access event.
    """
    try:
        answers = self._answer_questions_secure(questions, text_content)

        digest = self.calculate_document_hash(text_content)
        self.hipaa_logger.log_phi_processing(self.user_id, digest, "QA_COMPLETE")

        report = {
            "hipaa_compliance": {
                "processed_locally": True,
                "user_id": self.user_id,
                "document_hash": digest,
                "processing_timestamp": datetime.now().isoformat(),
            },
            "question_responses": answers,
        }

        if output_file:
            self.secure_handler.secure_save(report, output_file)

        return report
    except Exception as exc:
        # "DB_CONTENT" stands in for a file path: there is no source file here.
        self.hipaa_logger.log_access(self.user_id, "QA_ERROR", "DB_CONTENT", success=False)
        print(f"Error in process_questions_only_from_text: {exc}")
        raise exc
|
| 838 |
|
| 839 |
def process_annotations_only(self, pdf_path, output_file=None):
|
| 840 |
"""Process document for PubTator annotations only"""
|
|
|
|
| 869 |
except Exception as e:
|
| 870 |
self.hipaa_logger.log_access(self.user_id, "ANNOTATION_ERROR", pdf_path, success=False)
|
| 871 |
raise e
|
| 872 |
+
|
| 873 |
+
def save_to_database(self, pdf_path, pdf_upload_id):
    """Extract text from a PDF and update an existing record in PostgreSQL.

    Args:
        pdf_path: Path of the PDF to extract.
        pdf_upload_id: Primary key of the tbl_pdf_uploads row to update.

    Returns:
        dict with status, message, the updated row id, and the document hash.

    Raises:
        ValueError: if pdf_upload_id is missing or no matching row exists.
        RuntimeError: if psycopg2 is not installed.
        psycopg2.Error: on database failures (after rollback).
    """
    if not pdf_upload_id:
        raise ValueError("pdf_upload_id is required for database update")
    # Fix: psycopg2 is imported optionally at module load; without this guard
    # psycopg2.connect below would raise a confusing NameError when it is absent
    # (the /upload_db endpoint already performs the same check).
    if not PSYCOPG2_AVAILABLE:
        raise RuntimeError("Database features are not available. Please install psycopg2.")

    combined_text, images, ocr_results, doc_hash = self._prepare_document(pdf_path)

    db_config = {
        "host": os.getenv("DB_HOST", "localhost"),
        "database": os.getenv("DB_NAME", "Scholarly"),
        "user": os.getenv("DB_USER", "postgres"),
        "password": os.getenv("DB_PASSWORD", "admin"),
    }

    conn = None
    try:
        conn = psycopg2.connect(**db_config)
        cur = conn.cursor()
        try:
            # NOTE(review): this filters on "id" while /upload_db filters on
            # "pdf_uploaded_id" — confirm which column is the actual key.
            update_query = """
                UPDATE tbl_pdf_uploads
                SET content = %s
                WHERE id = %s
                RETURNING id;
            """
            cur.execute(update_query, (combined_text, pdf_upload_id))

            row = cur.fetchone()
            if not row:
                raise ValueError(f"No record found with id {pdf_upload_id}")

            updated_id = row[0]
            conn.commit()
        finally:
            # Fix: the cursor was previously never closed.
            cur.close()

        self.hipaa_logger.log_access(self.user_id, "DB_UPDATE", pdf_path)
        print(f"Document content updated in database. ID: {updated_id}")

        return {
            "status": "success",
            "message": "Content updated in database",
            "db_id": updated_id,
            "document_hash": doc_hash,
        }

    except psycopg2.Error as e:
        if conn:
            conn.rollback()
        self.hipaa_logger.log_access(self.user_id, "DB_UPDATE_ERROR", pdf_path, success=False)
        print(f"Database error: {e}")
        raise
    except Exception as e:
        print(f"Error updating database: {e}")
        raise
    finally:
        if conn:
            conn.close()
|
| 934 |
|
| 935 |
def _extract_text_and_images(self, pdf_path):
|
| 936 |
"""Securely extract text and images from PDF"""
|
|
|
|
| 1291 |
except Exception as e:
|
| 1292 |
print(f"Error in get_annotations: {e}")
|
| 1293 |
return {"error": str(e)}
|
| 1294 |
+
|
| 1295 |
+
@app.post('/upload_db')
async def upload_db(upload_db: str = Form(...), pdf_file: UploadFile = File(...)):
    """Read an uploaded PDF, extract text & images + OCR, and save content to database.

    Args:
        upload_db: pdf_uploaded_id of the tbl_pdf_uploads row to update.
        pdf_file: The uploaded PDF.

    Returns:
        Success dict with the updated row id, or an {"error": ...} dict.
    """
    if not PSYCOPG2_AVAILABLE:
        return {"error": "Database features are not available. Please install psycopg2."}

    # 1. Extract content (text + image OCR). Fix: this logic was previously
    # duplicated inline here; delegate to the shared helper used by
    # /extract_content so both endpoints stay in sync.
    try:
        pdf_stream = await pdf_file.read()
        combined_text = extract_content_from_pdf_stream(pdf_stream)["combined_text"]
    except Exception as e:
        print(f"Error extracting PDF content: {e}")
        return {"error": f"PDF extraction failed: {str(e)}"}

    # 2. Database configuration (env-driven, with local dev defaults).
    db_config = {
        "host": os.getenv("DB_HOST", "localhost"),
        "database": os.getenv("DB_NAME", "Scholarly"),
        "user": os.getenv("DB_USER", "postgres"),
        "password": os.getenv("DB_PASSWORD", "admin"),
    }

    # 3. Update the existing record.
    conn = None
    try:
        conn = psycopg2.connect(**db_config)
        cur = conn.cursor()
        try:
            update_query = """
                UPDATE tbl_pdf_uploads
                SET content = %s
                WHERE pdf_uploaded_id = %s
                RETURNING pdf_uploaded_id;
            """
            cur.execute(update_query, (combined_text, upload_db))

            row = cur.fetchone()
            if not row:
                return {"error": f"No record found with id {upload_db}"}

            updated_id = row[0]
            conn.commit()
        finally:
            # Fix: the cursor was previously never closed.
            cur.close()

        print(f"Document content updated in database. ID: {updated_id}")
        return {
            "status": "success",
            "message": "Content updated in database",
            "db_id": updated_id,
        }

    except psycopg2.Error as e:
        if conn:
            conn.rollback()
        print(f"Database error: {e}")
        return {"error": f"Database error: {str(e)}"}
    except Exception as e:
        print(f"Error in upload_db: {e}")
        return {"error": str(e)}
    finally:
        if conn:
            conn.close()
|
| 1442 |
+
|
| 1443 |
+
|
| 1444 |
+
class ExtractFromUrlRequest(BaseModel):
    """Payload for /extract_content: the document to fetch and its TLS policy."""
    document_url: str
    # None means auto-detect: verification is switched off only for localhost URLs.
    verify_ssl: Optional[bool] = None
|
| 1448 |
+
|
| 1449 |
+
|
| 1450 |
+
def extract_content_from_pdf_stream(pdf_stream: bytes) -> dict:
    """
    Extract text and images with OCR from a PDF byte stream.

    Args:
        pdf_stream: PDF file content as bytes

    Returns:
        dict with text_content, ocr_text_content, combined_text, images_count
    """

    def _preprocess_for_ocr(img):
        """Denoise/binarize an RGB PIL image to improve OCR accuracy."""
        if OPENCV_AVAILABLE:
            gray = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2GRAY)
            denoised = cv2.medianBlur(gray, 3)
            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
            enhanced = clahe.apply(denoised)
            _, thresh = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
            return Image.fromarray(thresh)
        # PIL-only fallback when OpenCV is unavailable.
        gray = img.convert('L')
        enhanced = ImageEnhance.Contrast(gray).enhance(2.0)
        return enhanced.filter(ImageFilter.SHARPEN)

    extracted_text = []
    extracted_images = []

    doc = fitz.open(stream=pdf_stream, filetype="pdf")
    try:
        for page_num, page in enumerate(doc):
            # Extract text
            extracted_text.append(page.get_text())

            # Extract images
            for img_index, img in enumerate(page.get_images()):
                try:
                    xref = img[0]
                    pix = fitz.Pixmap(doc, xref)

                    # CMYK (or other >3-channel) pixmaps must be converted to RGB.
                    if pix.n - pix.alpha >= 4:
                        pix = fitz.Pixmap(fitz.csRGB, pix)

                    img_pil = Image.open(io.BytesIO(pix.tobytes("ppm")))
                    extracted_images.append({
                        'page': page_num + 1,
                        'index': img_index,
                        'image': img_pil,
                    })
                    pix = None  # release pixmap promptly
                except Exception as e:
                    print(f"Error extracting image {img_index} on page {page_num}: {e}")
    finally:
        # Fix: previously doc.close() was skipped if extraction raised mid-loop,
        # leaking the document handle.
        doc.close()

    text_content = "\n".join(extracted_text)

    # Perform OCR on extracted images
    ocr_text_content = ""
    if extracted_images:
        print(f"Performing OCR on {len(extracted_images)} images...")
        for img_info in extracted_images:
            try:
                img = img_info['image']
                if img.mode != 'RGB':
                    img = img.convert('RGB')

                ocr_result = pytesseract.image_to_string(
                    _preprocess_for_ocr(img), config='--psm 6'
                )
                if ocr_result.strip():
                    ocr_text_content += f" {ocr_result.strip()}"
            except Exception as e:
                print(f"OCR failed for image {img_info['index']}: {e}")

    # Combine and normalize whitespace.
    combined_text = re.sub(r'\s+', ' ', text_content + "\n" + ocr_text_content).strip()

    return {
        "text_content": text_content,
        "ocr_text_content": ocr_text_content,
        "combined_text": combined_text,
        "images_count": len(extracted_images),
    }
|
| 1548 |
+
|
| 1549 |
+
|
| 1550 |
+
def download_pdf_from_url(document_url: str, verify_ssl: Optional[bool] = None) -> bytes:
    """
    Download PDF from URL and return as bytes.

    Args:
        document_url: URL to download PDF from
        verify_ssl: Whether to verify SSL. None = auto-detect (disabled for localhost)

    Returns:
        PDF content as bytes

    Raises:
        requests.exceptions.RequestException: on download/HTTP/SSL failure.
    """
    from urllib.parse import urlparse
    hostname = urlparse(document_url).hostname or ''

    # Auto-disable SSL verification for localhost targets only.
    if verify_ssl is None:
        if hostname.lower() in ('localhost', '127.0.0.1', '::1'):
            verify_ssl = False
            print("Note: SSL verification disabled for localhost URL")
            urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
        else:
            verify_ssl = True

    print(f"Downloading document from URL: {document_url}")
    # Fix: with stream=True the response was never closed, which can leak the
    # underlying connection (especially when raise_for_status throws). The
    # context manager guarantees release; .content still reads the full body.
    with requests.get(document_url, timeout=30, stream=True, verify=verify_ssl) as response:
        response.raise_for_status()

        # Warn (but proceed) when the payload doesn't look like a PDF.
        content_type = response.headers.get('content-type', '').lower()
        if 'pdf' not in content_type and not document_url.lower().endswith('.pdf'):
            print(f"Warning: Content type is {content_type}, might not be a PDF")

        return response.content
|
| 1585 |
+
|
| 1586 |
+
|
| 1587 |
+
@app.post('/extract_content')
async def extract_content(req: ExtractFromUrlRequest):
    """
    Read PDF from URL, extract text & images + OCR, and return content.
    Similar to upload_db but accepts URL instead of file and returns content instead of DB update.
    """
    try:
        # 1. Download the document from URL
        pdf_stream = download_pdf_from_url(req.document_url, req.verify_ssl)

        # 2. Extract content (Text + Images + OCR)
        extraction_result = extract_content_from_pdf_stream(pdf_stream)

        # 3. Short hash of the raw bytes for tracking/log correlation
        doc_hash = hashlib.sha256(pdf_stream).hexdigest()[:16]

        print(f"Document extracted successfully from URL. Hash: {doc_hash}")

        return {
            "status": "success",
            "message": "Content extracted from URL",
            "document_hash": doc_hash,
            "content": extraction_result["combined_text"],
            "statistics": {
                "text_length": len(extraction_result["text_content"]),
                "ocr_text_length": len(extraction_result["ocr_text_content"]),
                "combined_length": len(extraction_result["combined_text"]),
                "images_processed": extraction_result["images_count"]
            }
        }

    except requests.exceptions.SSLError as e:
        error_msg = f"SSL certificate verification failed: {e}"
        print(error_msg)
        return {"error": error_msg, "hint": "For localhost, SSL verification is automatically disabled. For other domains with self-signed certs, consider using a trusted certificate."}
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
        return {"error": f"Failed to download from URL: {str(e)}"}
    except Exception as e:
        # Fix: the log message previously named a nonexistent "extract_from_url".
        print(f"Error in extract_content: {e}")
        return {"error": str(e)}
|
| 1628 |
|
| 1629 |
@app.post('/analyze')
|
| 1630 |
def analyze(req: AnalyzeReq):
|
requirements.txt
CHANGED
|
@@ -13,3 +13,9 @@ torch==2.8.0
|
|
| 13 |
transformers==4.56.1
|
| 14 |
urllib3==2.2.0
|
| 15 |
uvicorn
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
transformers==4.56.1
|
| 14 |
urllib3==2.2.0
|
| 15 |
uvicorn
|
| 16 |
+
scikit-learn==1.4.2
|
| 17 |
+
rank-bm25==0.2.2
|
| 18 |
+
sentence-transformers==2.7.0
|
| 19 |
+
pymupdf==1.24.9
|
| 20 |
+
textstat==0.7.4
|
| 21 |
+
psycopg2-binary==2.9.10
|