Spaces:
Sleeping
Sleeping
glitz-dev
committed on
Commit
·
1fd1e18
1
Parent(s):
8e51ed8
extract content & save data from app itself or return the response added
Browse files- hipaathesis.py +463 -1
- requirements.txt +6 -0
hipaathesis.py
CHANGED
|
@@ -98,8 +98,14 @@ import shutil
|
|
| 98 |
import numpy as np
|
| 99 |
import requests
|
| 100 |
import urllib3
|
| 101 |
-
from fastapi import FastAPI
|
| 102 |
from fastapi.staticfiles import StaticFiles
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
from pydantic import BaseModel
|
| 104 |
from typing import List, Dict, Any, Optional
|
| 105 |
|
|
@@ -741,6 +747,39 @@ class HIPAACompliantThesisAnalyzer:
|
|
| 741 |
except Exception as e:
|
| 742 |
self.hipaa_logger.log_access(self.user_id, "SUMMARY_ERROR", pdf_path, success=False)
|
| 743 |
raise e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 744 |
|
| 745 |
def process_questions_only(self, pdf_path, questions, output_file=None):
|
| 746 |
"""Process document for Q&A only"""
|
|
@@ -769,6 +808,33 @@ class HIPAACompliantThesisAnalyzer:
|
|
| 769 |
except Exception as e:
|
| 770 |
self.hipaa_logger.log_access(self.user_id, "QA_ERROR", pdf_path, success=False)
|
| 771 |
raise e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 772 |
|
| 773 |
def process_annotations_only(self, pdf_path, output_file=None):
|
| 774 |
"""Process document for PubTator annotations only"""
|
|
@@ -803,6 +869,68 @@ class HIPAACompliantThesisAnalyzer:
|
|
| 803 |
except Exception as e:
|
| 804 |
self.hipaa_logger.log_access(self.user_id, "ANNOTATION_ERROR", pdf_path, success=False)
|
| 805 |
raise e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 806 |
|
| 807 |
def _extract_text_and_images(self, pdf_path):
|
| 808 |
"""Securely extract text and images from PDF"""
|
|
@@ -1163,6 +1291,340 @@ def get_annotations(req: AnalyzeReq):
|
|
| 1163 |
except Exception as e:
|
| 1164 |
print(f"Error in get_annotations: {e}")
|
| 1165 |
return {"error": str(e)}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1166 |
|
| 1167 |
@app.post('/analyze')
|
| 1168 |
def analyze(req: AnalyzeReq):
|
|
|
|
| 98 |
import numpy as np
|
| 99 |
import requests
|
| 100 |
import urllib3
|
| 101 |
+
from fastapi import FastAPI, UploadFile, File, Form
|
| 102 |
from fastapi.staticfiles import StaticFiles
|
| 103 |
+
try:
|
| 104 |
+
import psycopg2
|
| 105 |
+
PSYCOPG2_AVAILABLE = True
|
| 106 |
+
except ImportError:
|
| 107 |
+
PSYCOPG2_AVAILABLE = False
|
| 108 |
+
print("Warning: psycopg2 not available. Database features will be disabled.")
|
| 109 |
from pydantic import BaseModel
|
| 110 |
from typing import List, Dict, Any, Optional
|
| 111 |
|
|
|
|
| 747 |
except Exception as e:
|
| 748 |
self.hipaa_logger.log_access(self.user_id, "SUMMARY_ERROR", pdf_path, success=False)
|
| 749 |
raise e
|
| 750 |
+
def process_summary_only_from_text(self, text_content, output_file=None):
    """Summarize already-extracted text (no PDF extraction step).

    Mirrors process_summary_only, but accepts raw text — e.g. content
    previously stored in the database — instead of a PDF path.

    Args:
        text_content: Document text to analyze.
        output_file: Optional path; when given, the report is persisted
            via the secure handler.

    Returns:
        Report dict with HIPAA-compliance metadata and the text analysis.

    Raises:
        Exception: re-raised after logging a SUMMARY_ERROR access event.
    """
    try:
        # Evaluation order matches the original: summary, key terms, sections.
        analysis = {
            "summary": self._generate_summary_secure(text_content),
            "key_terms": self._extract_key_terms(text_content)[:15],
            "sections_found": list(self._extract_key_sections(text_content).keys()),
        }

        digest = self.calculate_document_hash(text_content)
        self.hipaa_logger.log_phi_processing(self.user_id, digest, "SUMMARY_COMPLETE")

        report = {
            "hipaa_compliance": {
                "processed_locally": True,
                "user_id": self.user_id,
                "document_hash": digest,
                "processing_timestamp": datetime.now().isoformat(),
            },
            "text_analysis": analysis,
        }

        if output_file:
            self.secure_handler.secure_save(report, output_file)

        return report
    except Exception as exc:
        # "DB_CONTENT" stands in for a file path: there is no source file here.
        self.hipaa_logger.log_access(self.user_id, "SUMMARY_ERROR", "DB_CONTENT", success=False)
        print(f"Error in process_summary_only_from_text: {exc}")
        raise exc
|
| 783 |
|
| 784 |
def process_questions_only(self, pdf_path, questions, output_file=None):
|
| 785 |
"""Process document for Q&A only"""
|
|
|
|
| 808 |
except Exception as e:
|
| 809 |
self.hipaa_logger.log_access(self.user_id, "QA_ERROR", pdf_path, success=False)
|
| 810 |
raise e
|
| 811 |
+
def process_questions_only_from_text(self, text_content, questions, output_file=None):
    """Answer questions against already-extracted text (no PDF extraction step).

    Companion to process_questions_only for content that arrives as raw
    text (e.g. fetched from the database).

    Args:
        text_content: Document text to query.
        questions: Questions to answer against the text.
        output_file: Optional path; when given, the report is persisted
            via the secure handler.

    Returns:
        Report dict with HIPAA-compliance metadata and the Q&A results.

    Raises:
        Exception: re-raised after logging a QA_ERROR access event.
    """
    try:
        answers = self._answer_questions_secure(questions, text_content)

        digest = self.calculate_document_hash(text_content)
        self.hipaa_logger.log_phi_processing(self.user_id, digest, "QA_COMPLETE")

        report = {
            "hipaa_compliance": {
                "processed_locally": True,
                "user_id": self.user_id,
                "document_hash": digest,
                "processing_timestamp": datetime.now().isoformat(),
            },
            "question_responses": answers,
        }

        if output_file:
            self.secure_handler.secure_save(report, output_file)

        return report
    except Exception as exc:
        # "DB_CONTENT" stands in for a file path: there is no source file here.
        self.hipaa_logger.log_access(self.user_id, "QA_ERROR", "DB_CONTENT", success=False)
        print(f"Error in process_questions_only_from_text: {exc}")
        raise exc
|
| 838 |
|
| 839 |
def process_annotations_only(self, pdf_path, output_file=None):
|
| 840 |
"""Process document for PubTator annotations only"""
|
|
|
|
| 869 |
except Exception as e:
|
| 870 |
self.hipaa_logger.log_access(self.user_id, "ANNOTATION_ERROR", pdf_path, success=False)
|
| 871 |
raise e
|
| 872 |
+
|
| 873 |
+
def save_to_database(self, pdf_path, pdf_upload_id):
    """Extract text from a PDF and update an existing record in PostgreSQL.

    Args:
        pdf_path: Path of the PDF to extract.
        pdf_upload_id: Primary key of the tbl_pdf_uploads row to update.

    Returns:
        dict with status, message, the updated row id, and the document hash.

    Raises:
        ValueError: if pdf_upload_id is missing or no matching row exists.
        RuntimeError: if psycopg2 is not installed.
        psycopg2.Error: on database failures (after rollback).
    """
    if not pdf_upload_id:
        raise ValueError("pdf_upload_id is required for database update")
    # Fix: psycopg2 is imported optionally at module load; without this guard
    # psycopg2.connect below would raise a confusing NameError when it is absent
    # (the /upload_db endpoint already performs the same check).
    if not PSYCOPG2_AVAILABLE:
        raise RuntimeError("Database features are not available. Please install psycopg2.")

    combined_text, images, ocr_results, doc_hash = self._prepare_document(pdf_path)

    db_config = {
        "host": os.getenv("DB_HOST", "localhost"),
        "database": os.getenv("DB_NAME", "Scholarly"),
        "user": os.getenv("DB_USER", "postgres"),
        "password": os.getenv("DB_PASSWORD", "admin"),
    }

    conn = None
    try:
        conn = psycopg2.connect(**db_config)
        cur = conn.cursor()
        try:
            # NOTE(review): this filters on "id" while /upload_db filters on
            # "pdf_uploaded_id" — confirm which column is the actual key.
            update_query = """
                UPDATE tbl_pdf_uploads
                SET content = %s
                WHERE id = %s
                RETURNING id;
            """
            cur.execute(update_query, (combined_text, pdf_upload_id))

            row = cur.fetchone()
            if not row:
                raise ValueError(f"No record found with id {pdf_upload_id}")

            updated_id = row[0]
            conn.commit()
        finally:
            # Fix: the cursor was previously never closed.
            cur.close()

        self.hipaa_logger.log_access(self.user_id, "DB_UPDATE", pdf_path)
        print(f"Document content updated in database. ID: {updated_id}")

        return {
            "status": "success",
            "message": "Content updated in database",
            "db_id": updated_id,
            "document_hash": doc_hash,
        }

    except psycopg2.Error as e:
        if conn:
            conn.rollback()
        self.hipaa_logger.log_access(self.user_id, "DB_UPDATE_ERROR", pdf_path, success=False)
        print(f"Database error: {e}")
        raise
    except Exception as e:
        print(f"Error updating database: {e}")
        raise
    finally:
        if conn:
            conn.close()
|
| 934 |
|
| 935 |
def _extract_text_and_images(self, pdf_path):
|
| 936 |
"""Securely extract text and images from PDF"""
|
|
|
|
| 1291 |
except Exception as e:
|
| 1292 |
print(f"Error in get_annotations: {e}")
|
| 1293 |
return {"error": str(e)}
|
| 1294 |
+
|
| 1295 |
+
@app.post('/upload_db')
async def upload_db(upload_db: str = Form(...), pdf_file: UploadFile = File(...)):
    """Read an uploaded PDF, extract text & images + OCR, and save content to database.

    Args:
        upload_db: pdf_uploaded_id of the tbl_pdf_uploads row to update.
        pdf_file: The uploaded PDF.

    Returns:
        Success dict with the updated row id, or an {"error": ...} dict.
    """
    if not PSYCOPG2_AVAILABLE:
        return {"error": "Database features are not available. Please install psycopg2."}

    # 1. Extract content (text + image OCR). Fix: this logic was previously
    # duplicated inline here; delegate to the shared helper used by
    # /extract_content so both endpoints stay in sync.
    try:
        pdf_stream = await pdf_file.read()
        combined_text = extract_content_from_pdf_stream(pdf_stream)["combined_text"]
    except Exception as e:
        print(f"Error extracting PDF content: {e}")
        return {"error": f"PDF extraction failed: {str(e)}"}

    # 2. Database configuration (env-driven, with local dev defaults).
    db_config = {
        "host": os.getenv("DB_HOST", "localhost"),
        "database": os.getenv("DB_NAME", "Scholarly"),
        "user": os.getenv("DB_USER", "postgres"),
        "password": os.getenv("DB_PASSWORD", "admin"),
    }

    # 3. Update the existing record.
    conn = None
    try:
        conn = psycopg2.connect(**db_config)
        cur = conn.cursor()
        try:
            update_query = """
                UPDATE tbl_pdf_uploads
                SET content = %s
                WHERE pdf_uploaded_id = %s
                RETURNING pdf_uploaded_id;
            """
            cur.execute(update_query, (combined_text, upload_db))

            row = cur.fetchone()
            if not row:
                return {"error": f"No record found with id {upload_db}"}

            updated_id = row[0]
            conn.commit()
        finally:
            # Fix: the cursor was previously never closed.
            cur.close()

        print(f"Document content updated in database. ID: {updated_id}")
        return {
            "status": "success",
            "message": "Content updated in database",
            "db_id": updated_id,
        }

    except psycopg2.Error as e:
        if conn:
            conn.rollback()
        print(f"Database error: {e}")
        return {"error": f"Database error: {str(e)}"}
    except Exception as e:
        print(f"Error in upload_db: {e}")
        return {"error": str(e)}
    finally:
        if conn:
            conn.close()
|
| 1442 |
+
|
| 1443 |
+
|
| 1444 |
+
class ExtractFromUrlRequest(BaseModel):
    """Payload for /extract_content: the document to fetch and its TLS policy."""
    document_url: str
    # None means auto-detect: verification is switched off only for localhost URLs.
    verify_ssl: Optional[bool] = None
|
| 1448 |
+
|
| 1449 |
+
|
| 1450 |
+
def extract_content_from_pdf_stream(pdf_stream: bytes) -> dict:
    """
    Extract text and images with OCR from a PDF byte stream.

    Args:
        pdf_stream: PDF file content as bytes

    Returns:
        dict with text_content, ocr_text_content, combined_text, images_count
    """

    def _preprocess_for_ocr(img):
        """Denoise/binarize an RGB PIL image to improve OCR accuracy."""
        if OPENCV_AVAILABLE:
            gray = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2GRAY)
            denoised = cv2.medianBlur(gray, 3)
            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
            enhanced = clahe.apply(denoised)
            _, thresh = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
            return Image.fromarray(thresh)
        # PIL-only fallback when OpenCV is unavailable.
        gray = img.convert('L')
        enhanced = ImageEnhance.Contrast(gray).enhance(2.0)
        return enhanced.filter(ImageFilter.SHARPEN)

    extracted_text = []
    extracted_images = []

    doc = fitz.open(stream=pdf_stream, filetype="pdf")
    try:
        for page_num, page in enumerate(doc):
            # Extract text
            extracted_text.append(page.get_text())

            # Extract images
            for img_index, img in enumerate(page.get_images()):
                try:
                    xref = img[0]
                    pix = fitz.Pixmap(doc, xref)

                    # CMYK (or other >3-channel) pixmaps must be converted to RGB.
                    if pix.n - pix.alpha >= 4:
                        pix = fitz.Pixmap(fitz.csRGB, pix)

                    img_pil = Image.open(io.BytesIO(pix.tobytes("ppm")))
                    extracted_images.append({
                        'page': page_num + 1,
                        'index': img_index,
                        'image': img_pil,
                    })
                    pix = None  # release pixmap promptly
                except Exception as e:
                    print(f"Error extracting image {img_index} on page {page_num}: {e}")
    finally:
        # Fix: previously doc.close() was skipped if extraction raised mid-loop,
        # leaking the document handle.
        doc.close()

    text_content = "\n".join(extracted_text)

    # Perform OCR on extracted images
    ocr_text_content = ""
    if extracted_images:
        print(f"Performing OCR on {len(extracted_images)} images...")
        for img_info in extracted_images:
            try:
                img = img_info['image']
                if img.mode != 'RGB':
                    img = img.convert('RGB')

                ocr_result = pytesseract.image_to_string(
                    _preprocess_for_ocr(img), config='--psm 6'
                )
                if ocr_result.strip():
                    ocr_text_content += f" {ocr_result.strip()}"
            except Exception as e:
                print(f"OCR failed for image {img_info['index']}: {e}")

    # Combine and normalize whitespace.
    combined_text = re.sub(r'\s+', ' ', text_content + "\n" + ocr_text_content).strip()

    return {
        "text_content": text_content,
        "ocr_text_content": ocr_text_content,
        "combined_text": combined_text,
        "images_count": len(extracted_images),
    }
|
| 1548 |
+
|
| 1549 |
+
|
| 1550 |
+
def download_pdf_from_url(document_url: str, verify_ssl: Optional[bool] = None) -> bytes:
    """
    Download PDF from URL and return as bytes.

    Args:
        document_url: URL to download PDF from
        verify_ssl: Whether to verify SSL. None = auto-detect (disabled for localhost)

    Returns:
        PDF content as bytes

    Raises:
        requests.exceptions.RequestException: on download/HTTP/SSL failure.
    """
    from urllib.parse import urlparse
    hostname = urlparse(document_url).hostname or ''

    # Auto-disable SSL verification for localhost targets only.
    if verify_ssl is None:
        if hostname.lower() in ('localhost', '127.0.0.1', '::1'):
            verify_ssl = False
            print("Note: SSL verification disabled for localhost URL")
            urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
        else:
            verify_ssl = True

    print(f"Downloading document from URL: {document_url}")
    # Fix: with stream=True the response was never closed, which can leak the
    # underlying connection (especially when raise_for_status throws). The
    # context manager guarantees release; .content still reads the full body.
    with requests.get(document_url, timeout=30, stream=True, verify=verify_ssl) as response:
        response.raise_for_status()

        # Warn (but proceed) when the payload doesn't look like a PDF.
        content_type = response.headers.get('content-type', '').lower()
        if 'pdf' not in content_type and not document_url.lower().endswith('.pdf'):
            print(f"Warning: Content type is {content_type}, might not be a PDF")

        return response.content
|
| 1585 |
+
|
| 1586 |
+
|
| 1587 |
+
@app.post('/extract_content')
async def extract_content(req: ExtractFromUrlRequest):
    """
    Read PDF from URL, extract text & images + OCR, and return content.
    Similar to upload_db but accepts URL instead of file and returns content instead of DB update.
    """
    try:
        # 1. Download the document from URL
        pdf_stream = download_pdf_from_url(req.document_url, req.verify_ssl)

        # 2. Extract content (Text + Images + OCR)
        extraction_result = extract_content_from_pdf_stream(pdf_stream)

        # 3. Short hash of the raw bytes for tracking/log correlation
        doc_hash = hashlib.sha256(pdf_stream).hexdigest()[:16]

        print(f"Document extracted successfully from URL. Hash: {doc_hash}")

        return {
            "status": "success",
            "message": "Content extracted from URL",
            "document_hash": doc_hash,
            "content": extraction_result["combined_text"],
            "statistics": {
                "text_length": len(extraction_result["text_content"]),
                "ocr_text_length": len(extraction_result["ocr_text_content"]),
                "combined_length": len(extraction_result["combined_text"]),
                "images_processed": extraction_result["images_count"]
            }
        }

    except requests.exceptions.SSLError as e:
        error_msg = f"SSL certificate verification failed: {e}"
        print(error_msg)
        return {"error": error_msg, "hint": "For localhost, SSL verification is automatically disabled. For other domains with self-signed certs, consider using a trusted certificate."}
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
        return {"error": f"Failed to download from URL: {str(e)}"}
    except Exception as e:
        # Fix: the log message previously named a nonexistent "extract_from_url".
        print(f"Error in extract_content: {e}")
        return {"error": str(e)}
|
| 1628 |
|
| 1629 |
@app.post('/analyze')
|
| 1630 |
def analyze(req: AnalyzeReq):
|
requirements.txt
CHANGED
|
@@ -13,3 +13,9 @@ torch==2.8.0
|
|
| 13 |
transformers==4.56.1
|
| 14 |
urllib3==2.2.0
|
| 15 |
uvicorn
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
transformers==4.56.1
|
| 14 |
urllib3==2.2.0
|
| 15 |
uvicorn
|
| 16 |
+
scikit-learn==1.4.2
|
| 17 |
+
rank-bm25==0.2.2
|
| 18 |
+
sentence-transformers==2.7.0
|
| 19 |
+
pymupdf==1.24.9
|
| 20 |
+
textstat==0.7.4
|
| 21 |
+
psycopg2-binary==2.9.10
|