Spaces:

ApsidalSolid4
/

CITProjectAIDetector

Running

App Files Community

ApsidalSolid4 committed on Mar 21, 2025

Commit

0c3e4c0

verified ·

1 Parent(s): c2529af

Update app.py

Browse files

Files changed (1) hide show

app.py +193 -202

app.py CHANGED Viewed

@@ -18,6 +18,10 @@ from openpyxl.utils import get_column_letter
 from io import BytesIO
 import base64
 import hashlib
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -32,6 +36,17 @@ CONFIDENCE_THRESHOLD = 0.65
 BATCH_SIZE = 8  # Reduced batch size for CPU
 MAX_WORKERS = 4  # Number of worker threads for processing
 # Get password hash from environment variable (more secure)
 ADMIN_PASSWORD_HASH = os.environ.get('ADMIN_PASSWORD_HASH')
@@ -41,17 +56,6 @@ if not ADMIN_PASSWORD_HASH:
 # Excel file path for logs
 EXCEL_LOG_PATH = "/tmp/prediction_logs.xlsx"
-import requests
-import base64
-import os
-import tempfile
-from typing import Dict, List, Optional, Union, Tuple
-import mimetypes
-import logging
-import time
-from pathlib import Path
 # OCR API settings
 OCR_API_KEY = "9e11346f1288957"  # This is a partial key - replace with the full one
 OCR_API_ENDPOINT = "https://api.ocr.space/parse/image"
@@ -205,172 +209,6 @@ class OCRProcessor:
         return mime_type
-# Function to be integrated with the main application
-def handle_file_upload_and_analyze(file_obj, mode: str, classifier) -> tuple:
-    """
-    Handle file upload, OCR processing, and text analysis
-    Args:
-        file_obj: Uploaded file object from Gradio
-        mode: Analysis mode (quick or detailed)
-        classifier: The TextClassifier instance
-    Returns:
-        Analysis results as a tuple (same format as original analyze_text function)
-    """
-    if file_obj is None:
-        return (
-            "No file uploaded",
-            "Please upload a file to analyze",
-            "No file uploaded for analysis"
-        )
-    # Create a temporary file
-    with tempfile.NamedTemporaryFile(delete=False, suffix=Path(file_obj.name).suffix) as temp_file:
-        temp_file_path = temp_file.name
-        # Write uploaded file to the temporary file
-        temp_file.write(file_obj.read())
-    try:
-        # Process the file with OCR
-        ocr_processor = OCRProcessor()
-        ocr_result = ocr_processor.process_file(temp_file_path)
-        if not ocr_result["success"]:
-            return (
-                "OCR Processing Error",
-                ocr_result["error"],
-                "Failed to extract text from the uploaded file"
-            )
-        # Get the extracted text
-        extracted_text = ocr_result["text"]
-        # If no text was extracted
-        if not extracted_text.strip():
-            return (
-                "No text extracted",
-                "The OCR process did not extract any text from the uploaded file.",
-                "No text was found in the uploaded file"
-            )
-        # Call the original text analysis function with the extracted text
-        return analyze_text(extracted_text, mode, classifier)
-    finally:
-        # Clean up the temporary file
-        if os.path.exists(temp_file_path):
-            os.remove(temp_file_path)
-# Modified Gradio interface setup function to include file upload
-def setup_gradio_interface(classifier):
-    """
-    Set up Gradio interface with text input and file upload options
-    Args:
-        classifier: The TextClassifier instance
-    Returns:
-        Gradio Interface object
-    """
-    import gradio as gr
-    with gr.Blocks(title="AI Text Detector") as demo:
-        gr.Markdown("# AI Text Detector with Document Upload")
-        gr.Markdown("Analyze text to detect if it was written by a human or AI. You can paste text directly or upload images, PDFs, or Word documents.")
-        with gr.Tab("Text Input"):
-            text_input = gr.Textbox(
-                lines=8,
-                placeholder="Enter text to analyze...",
-                label="Input Text"
-            )
-            mode_selection = gr.Radio(
-                choices=["quick", "detailed"],
-                value="quick",
-                label="Analysis Mode",
-                info="Quick mode for faster analysis, Detailed mode for sentence-level analysis"
-            )
-            text_submit_button = gr.Button("Analyze Text")
-            output_html = gr.HTML(label="Highlighted Analysis")
-            output_sentences = gr.Textbox(label="Sentence-by-Sentence Analysis", lines=10)
-            output_result = gr.Textbox(label="Overall Result", lines=4)
-            text_submit_button.click(
-                analyze_text,
-                inputs=[text_input, mode_selection, classifier],
-                outputs=[output_html, output_sentences, output_result]
-            )
-        with gr.Tab("File Upload"):
-            file_upload = gr.File(
-                label="Upload Document",
-                file_types=["image", "pdf", "doc", "docx"],
-                type="file"
-            )
-            file_mode_selection = gr.Radio(
-                choices=["quick", "detailed"],
-                value="quick",
-                label="Analysis Mode",
-                info="Quick mode for faster analysis, Detailed mode for sentence-level analysis"
-            )
-            upload_submit_button = gr.Button("Process and Analyze")
-            file_output_html = gr.HTML(label="Highlighted Analysis")
-            file_output_sentences = gr.Textbox(label="Sentence-by-Sentence Analysis", lines=10)
-            file_output_result = gr.Textbox(label="Overall Result", lines=4)
-            upload_submit_button.click(
-                handle_file_upload_and_analyze,
-                inputs=[file_upload, file_mode_selection, classifier],
-                outputs=[file_output_html, file_output_sentences, file_output_result]
-            )
-            gr.Markdown("""
-            ### File Upload Limitations
-            - Maximum file size: 1MB
-            - PDF files: Maximum 3 pages (OCR.space API limitation)
-            - Supported formats: Images (PNG, JPG, GIF), PDF, Word documents (DOCX, DOC)
-            """)
-    return demo
-# This function is a replacement for the original main app setup
-def setup_app_with_ocr():
-    """
-    Setup the application with OCR capabilities
-    """
-    # Initialize the classifier (use existing code)
-    classifier = TextClassifier()
-    # Create the Gradio interface with file upload functionality
-    demo = setup_gradio_interface(classifier)
-    # Get the FastAPI app from Gradio
-    app = demo.app
-    # Add CORS middleware (same as original code)
-    from fastapi.middleware.cors import CORSMiddleware
-    app.add_middleware(
-        CORSMiddleware,
-        allow_origins=["*"],  # For development
-        allow_credentials=True,
-        allow_methods=["GET", "POST", "OPTIONS"],
-        allow_headers=["*"],
-    )
-    # Return the demo for launching
-    return demo
 def is_admin_password(input_text: str) -> bool:
     """
     Check if the input text matches the admin password using secure hash comparison.
@@ -382,6 +220,7 @@ def is_admin_password(input_text: str) -> bool:
     # Compare hashes. NOTE: plain `==` is not a constant-time comparison; hmac.compare_digest is needed to resist timing attacks
     return input_hash == ADMIN_PASSWORD_HASH
 class TextWindowProcessor:
     def __init__(self):
         try:
@@ -433,13 +272,10 @@ class TextWindowProcessor:
         return windows, window_sentence_indices
 class TextClassifier:
     def __init__(self):
-        # Set thread configuration before any model loading or parallel work
-        if not torch.cuda.is_available():
-            torch.set_num_threads(MAX_WORKERS)
-            torch.set_num_interop_threads(MAX_WORKERS)
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.model_name = MODEL_NAME
         self.tokenizer = None
@@ -583,7 +419,7 @@ class TextClassifier:
                 for window_idx, indices in enumerate(batch_indices):
                     center_idx = len(indices) // 2
                     center_weight = 0.7  # Higher weight for center sentence
-                    edge_weight = 0.3 / (len(indices) - 1)  # Distribute remaining weight
                     for pos, sent_idx in enumerate(indices):
                         # Apply higher weight to center sentence
@@ -606,10 +442,10 @@ class TextClassifier:
                 # Apply minimal smoothing at prediction boundaries
                 if i > 0 and i < len(sentences) - 1:
-                    prev_human = sentence_scores[i-1]['human_prob'] / sentence_appearances[i-1]
-                    prev_ai = sentence_scores[i-1]['ai_prob'] / sentence_appearances[i-1]
-                    next_human = sentence_scores[i+1]['human_prob'] / sentence_appearances[i+1]
-                    next_ai = sentence_scores[i+1]['ai_prob'] / sentence_appearances[i+1]
                     # Check if we're at a prediction boundary
                     current_pred = 'human' if human_prob > ai_prob else 'ai'
@@ -684,6 +520,65 @@ class TextClassifier:
             'num_sentences': num_sentences
         }
 def initialize_excel_log():
     """Initialize the Excel log file if it doesn't exist."""
     if not os.path.exists(EXCEL_LOG_PATH):
@@ -711,6 +606,7 @@ def initialize_excel_log():
         wb.save(EXCEL_LOG_PATH)
         logger.info(f"Initialized Excel log file at {EXCEL_LOG_PATH}")
 def log_prediction_data(input_text, word_count, prediction, confidence, execution_time, mode):
     """Log prediction data to an Excel file in the /tmp directory."""
     # Initialize the Excel file if it doesn't exist
@@ -753,6 +649,7 @@ def log_prediction_data(input_text, word_count, prediction, confidence, executio
         logger.error(f"Error logging prediction data to Excel: {str(e)}")
         return False
 def get_logs_as_base64():
     """Read the Excel logs file and return as base64 for downloading."""
     if not os.path.exists(EXCEL_LOG_PATH):
@@ -771,6 +668,7 @@ def get_logs_as_base64():
         logger.error(f"Error reading Excel logs: {str(e)}")
         return None
 def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
     """Analyze text using specified mode and return formatted results."""
     # Check if the input text matches the admin password using secure comparison
@@ -890,27 +788,120 @@ def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
             overall_result
         )
-# Initialize the classifier globally
-classifier = TextClassifier()
-# Create Gradio interface
-demo = setup_app_with_ocr()
-# Get the FastAPI app from Gradio
-app = demo.app
-# Add CORS middleware
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],  # For development
-    allow_credentials=True,
-    allow_methods=["GET", "POST", "OPTIONS"],
-    allow_headers=["*"],
-)
-# Ensure CORS is applied before launching
 if __name__ == "__main__":
     demo.queue()
     demo.launch(
         server_name="0.0.0.0",

 from io import BytesIO
 import base64
 import hashlib
+import requests
+import tempfile
+from pathlib import Path
+import mimetypes
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 BATCH_SIZE = 8  # Reduced batch size for CPU
 MAX_WORKERS = 4  # Number of worker threads for processing
+# IMPORTANT: Set PyTorch thread configuration at the module level
+# before any parallel work starts
+if not torch.cuda.is_available():
+    # Set thread configuration only once at the beginning
+    torch.set_num_threads(MAX_WORKERS)
+    try:
+        # Only set interop threads if it hasn't been set already
+        torch.set_num_interop_threads(MAX_WORKERS)
+    except RuntimeError as e:
+        logger.warning(f"Could not set interop threads: {str(e)}")
 # Get password hash from environment variable (more secure)
 ADMIN_PASSWORD_HASH = os.environ.get('ADMIN_PASSWORD_HASH')
 # Excel file path for logs
 EXCEL_LOG_PATH = "/tmp/prediction_logs.xlsx"
 # OCR API settings
 OCR_API_KEY = "9e11346f1288957"  # This is a partial key - replace with the full one
 OCR_API_ENDPOINT = "https://api.ocr.space/parse/image"
         return mime_type
 def is_admin_password(input_text: str) -> bool:
     """
     Check if the input text matches the admin password using secure hash comparison.
     # Compare hashes. NOTE: plain `==` is not a constant-time comparison; hmac.compare_digest is needed to resist timing attacks
     return input_hash == ADMIN_PASSWORD_HASH
 class TextWindowProcessor:
     def __init__(self):
         try:
         return windows, window_sentence_indices
 class TextClassifier:
     def __init__(self):
+        # FIXED: Removed the thread configuration here, as it's now at the module level
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.model_name = MODEL_NAME
         self.tokenizer = None
                 for window_idx, indices in enumerate(batch_indices):
                     center_idx = len(indices) // 2
                     center_weight = 0.7  # Higher weight for center sentence
+                    edge_weight = 0.3 / (len(indices) - 1) if len(indices) > 1 else 0  # Distribute remaining weight
                     for pos, sent_idx in enumerate(indices):
                         # Apply higher weight to center sentence
                 # Apply minimal smoothing at prediction boundaries
                 if i > 0 and i < len(sentences) - 1:
+                    prev_human = sentence_scores[i-1]['human_prob'] / max(sentence_appearances[i-1], 1e-10)
+                    prev_ai = sentence_scores[i-1]['ai_prob'] / max(sentence_appearances[i-1], 1e-10)
+                    next_human = sentence_scores[i+1]['human_prob'] / max(sentence_appearances[i+1], 1e-10)
+                    next_ai = sentence_scores[i+1]['ai_prob'] / max(sentence_appearances[i+1], 1e-10)
                     # Check if we're at a prediction boundary
                     current_pred = 'human' if human_prob > ai_prob else 'ai'
             'num_sentences': num_sentences
         }
+# Function to handle file upload, OCR processing, and text analysis
+def handle_file_upload_and_analyze(file_obj, mode: str, classifier) -> tuple:
+    """
+    Handle file upload, OCR processing, and text analysis
+    Args:
+        file_obj: Uploaded file object from Gradio
+        mode: Analysis mode (quick or detailed)
+        classifier: The TextClassifier instance
+    Returns:
+        Analysis results as a tuple (same format as original analyze_text function)
+    """
+    if file_obj is None:
+        return (
+            "No file uploaded",
+            "Please upload a file to analyze",
+            "No file uploaded for analysis"
+        )
+    # Create a temporary file
+    with tempfile.NamedTemporaryFile(delete=False, suffix=Path(file_obj.name).suffix) as temp_file:
+        temp_file_path = temp_file.name
+        # Write uploaded file to the temporary file
+        temp_file.write(file_obj.read())
+    try:
+        # Process the file with OCR
+        ocr_processor = OCRProcessor()
+        ocr_result = ocr_processor.process_file(temp_file_path)
+        if not ocr_result["success"]:
+            return (
+                "OCR Processing Error",
+                ocr_result["error"],
+                "Failed to extract text from the uploaded file"
+            )
+        # Get the extracted text
+        extracted_text = ocr_result["text"]
+        # If no text was extracted
+        if not extracted_text.strip():
+            return (
+                "No text extracted",
+                "The OCR process did not extract any text from the uploaded file.",
+                "No text was found in the uploaded file"
+            )
+        # Call the original text analysis function with the extracted text
+        return analyze_text(extracted_text, mode, classifier)
+    finally:
+        # Clean up the temporary file
+        if os.path.exists(temp_file_path):
+            os.remove(temp_file_path)
 def initialize_excel_log():
     """Initialize the Excel log file if it doesn't exist."""
     if not os.path.exists(EXCEL_LOG_PATH):
         wb.save(EXCEL_LOG_PATH)
         logger.info(f"Initialized Excel log file at {EXCEL_LOG_PATH}")
 def log_prediction_data(input_text, word_count, prediction, confidence, execution_time, mode):
     """Log prediction data to an Excel file in the /tmp directory."""
     # Initialize the Excel file if it doesn't exist
         logger.error(f"Error logging prediction data to Excel: {str(e)}")
         return False
 def get_logs_as_base64():
     """Read the Excel logs file and return as base64 for downloading."""
     if not os.path.exists(EXCEL_LOG_PATH):
         logger.error(f"Error reading Excel logs: {str(e)}")
         return None
 def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
     """Analyze text using specified mode and return formatted results."""
     # Check if the input text matches the admin password using secure comparison
             overall_result
         )
+# Modified Gradio interface setup function to include file upload
+def setup_gradio_interface(classifier):
+    """
+    Set up Gradio interface with text input and file upload options
+    Args:
+        classifier: The TextClassifier instance
+    Returns:
+        Gradio Interface object
+    """
+    import gradio as gr
+    with gr.Blocks(title="AI Text Detector") as demo:
+        gr.Markdown("# AI Text Detector with Document Upload")
+        gr.Markdown("Analyze text to detect if it was written by a human or AI. You can paste text directly or upload images, PDFs, or Word documents.")
+        with gr.Tab("Text Input"):
+            text_input = gr.Textbox(
+                lines=8,
+                placeholder="Enter text to analyze...",
+                label="Input Text"
+            )
+            mode_selection = gr.Radio(
+                choices=["quick", "detailed"],
+                value="quick",
+                label="Analysis Mode",
+                info="Quick mode for faster analysis, Detailed mode for sentence-level analysis"
+            )
+            text_submit_button = gr.Button("Analyze Text")
+            output_html = gr.HTML(label="Highlighted Analysis")
+            output_sentences = gr.Textbox(label="Sentence-by-Sentence Analysis", lines=10)
+            output_result = gr.Textbox(label="Overall Result", lines=4)
+            text_submit_button.click(
+                analyze_text,
+                inputs=[text_input, mode_selection, classifier],
+                outputs=[output_html, output_sentences, output_result]
+            )
+        with gr.Tab("File Upload"):
+            file_upload = gr.File(
+                label="Upload Document",
+                file_types=["image", "pdf", "doc", "docx"],
+                type="file"
+            )
+            file_mode_selection = gr.Radio(
+                choices=["quick", "detailed"],
+                value="quick",
+                label="Analysis Mode",
+                info="Quick mode for faster analysis, Detailed mode for sentence-level analysis"
+            )
+            upload_submit_button = gr.Button("Process and Analyze")
+            file_output_html = gr.HTML(label="Highlighted Analysis")
+            file_output_sentences = gr.Textbox(label="Sentence-by-Sentence Analysis", lines=10)
+            file_output_result = gr.Textbox(label="Overall Result", lines=4)
+            upload_submit_button.click(
+                handle_file_upload_and_analyze,
+                inputs=[file_upload, file_mode_selection, classifier],
+                outputs=[file_output_html, file_output_sentences, file_output_result]
+            )
+            gr.Markdown("""
+            ### File Upload Limitations
+            - Maximum file size: 1MB
+            - PDF files: Maximum 3 pages (OCR.space API limitation)
+            - Supported formats: Images (PNG, JPG, GIF), PDF, Word documents (DOCX, DOC)
+            """)
+    return demo
+# This function is a replacement for the original main app setup
+def setup_app_with_ocr():
+    """
+    Setup the application with OCR capabilities
+    """
+    # Initialize the classifier (uses the fixed class)
+    classifier = TextClassifier()
+    # Create the Gradio interface with file upload functionality
+    demo = setup_gradio_interface(classifier)
+    # Get the FastAPI app from Gradio
+    app = demo.app
+    # Add CORS middleware (same as original code)
+    from fastapi.middleware.cors import CORSMiddleware
+    app.add_middleware(
+        CORSMiddleware,
+        allow_origins=["*"],  # For development
+        allow_credentials=True,
+        allow_methods=["GET", "POST", "OPTIONS"],
+        allow_headers=["*"],
+    )
+    # Return the demo for launching
+    return demo
+# Initialize the application
 if __name__ == "__main__":
+    # Create the app with OCR functionality
+    demo = setup_app_with_ocr()
+    # Start the server
     demo.queue()
     demo.launch(
         server_name="0.0.0.0",