# Spaces: Krish-05/text-extraction-api (status: Sleeping) - File size: 3,048 Bytes

import os
import shutil
from dotenv import load_dotenv

# Load environment variables from a .env file.
# This runs at import time, before the os.getenv() lookups further down in
# this module, so values defined in .env are visible to them.
load_dotenv()

# --- Paths ---
# BASE_DIR is the directory containing this module; the other paths are
# built relative to it.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
UPLOAD_DIR = os.path.join(BASE_DIR, "uploads")
STATIC_DIR = os.path.join(BASE_DIR, "static")

# Create the uploads directory if it doesn't exist (import-time side effect;
# exist_ok=True makes this a no-op when it is already there).
# Note that STATIC_DIR is not created here.
os.makedirs(UPLOAD_DIR, exist_ok=True)

# --- File Upload Settings ---
# Maximum upload size, in megabytes and as the equivalent byte count.
MAX_FILE_SIZE_MB = 50
MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024  # 1 MB is taken as 1024 * 1024 bytes
# File extensions (lowercase, without the leading dot) mapped to their MIME
# types. "jpg" and "jpeg" both map to "image/jpeg".
ALLOWED_EXTENSIONS = {
    "pdf": "application/pdf",
    "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "png": "image/png",
    "jpg": "image/jpeg",
    "jpeg": "image/jpeg",
    "tiff": "image/tiff",
    "bmp": "image/bmp",
    "webp": "image/webp",
}

# --- OCR Configuration ---
# EasyOCR settings
EASYOCR_LANGS = ["en"]  # Language codes to support ("en" = English)
EASYOCR_GPU = False      # Set to True if an NVIDIA GPU is available and CUDA is installed

# Keep Tesseract as fallback if needed, but prioritize EasyOCR for accuracy
def find_tesseract():
    """Locate the Tesseract OCR executable.

    The system PATH is searched first, which works on any platform. If
    nothing is found there, a few conventional Windows install locations
    are probed as a fallback.

    Returns:
        The path to the executable as a string, or None when Tesseract
        could not be found.
    """
    # Preferred: whatever "tesseract" resolves to on PATH.
    on_path = shutil.which("tesseract")
    if on_path:
        return on_path

    # Fallback: conventional Windows install locations.
    candidates = [
        r"C:\Program Files\Tesseract-OCR\tesseract.exe",
        r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe",
    ]
    # The per-user location can only be built when the user name is known;
    # without it the formatted path would be malformed, so skip it.
    username = os.getenv("USERNAME", "")
    if username:
        candidates.append(
            r"C:\Users\{}\AppData\Local\Tesseract-OCR\tesseract.exe".format(username)
        )

    # First candidate that exists as a regular file, else None.
    return next((path for path in candidates if os.path.isfile(path)), None)

# Resolved once, at import time; None when no Tesseract executable was found.
TESSERACT_CMD = find_tesseract()
# Tesseract language code ("eng" = English).
TESSERACT_LANG = "eng"

def check_ocr_availability():
    """Report which OCR engine, if any, can be used.

    Returns:
        "available" when the easyocr package can be imported,
        "tesseract-only" when it cannot but TESSERACT_CMD is set,
        "not-found" when neither engine is usable.
    """
    try:
        # Imported only to probe for the package; the module is not used here.
        import easyocr  # noqa: F401
    except ImportError:
        # No EasyOCR: fall back to Tesseract when an executable was located.
        return "tesseract-only" if TESSERACT_CMD else "not-found"
    return "available"

# --- Summarization Settings ---
SUMMARY_SENTENCE_COUNT = 5  # presumably the number of sentences in a summary - confirm against the summarizer
SUMMARY_ALGORITHM = "lex-rank"  # Options: lex-rank, lsa, luhn, edmundson

# --- NER Settings ---
SPACY_MODEL = "en_core_web_sm"  # name of the spaCy pipeline to load
# Entity labels of interest; presumably used to filter NER output - confirm against the NER code.
NER_ENTITY_TYPES = ["PERSON", "ORG", "DATE", "MONEY", "GPE", "EVENT", "PRODUCT", "LAW", "NORP"]

# --- Sentiment Settings ---
# Score cut-offs for labelling sentiment. NOTE(review): presumably scores at or
# above "positive" count as positive, at or below "negative" as negative, and
# anything in between as neutral - confirm against the sentiment code.
SENTIMENT_THRESHOLDS = {
    "positive": 0.05,
    "negative": -0.05,
}

# --- Gemini AI Configuration ---
# Both values are read from the environment at import time.
# GEMINI_API_KEY is None when the variable is unset; the model name falls
# back to "gemini-2.5-flash" when GEMINI_MODEL is unset.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
GEMINI_MODEL_NAME = os.environ.get("GEMINI_MODEL", "gemini-2.5-flash")

# API access key for external clients.
# The first non-empty variable wins, checked in this order:
# API_ACCESS_KEY, then VALID_API_KEY, then API_KEY.
API_ACCESS_KEY = (
    os.environ.get("API_ACCESS_KEY")
    or os.environ.get("VALID_API_KEY")
    or os.environ.get("API_KEY")
)

# Competition/deployment toggle:
# When false, endpoints are public and evaluators can call APIs without auth headers.
# The value is read from the environment at import time, ignoring case and
# surrounding whitespace. "true", "1", "yes" and "on" enable the check;
# anything else (including an unset variable) leaves it disabled. Accepting
# the common truthy spellings avoids silently leaving the API public when an
# operator writes e.g. REQUIRE_API_KEY=1.
REQUIRE_API_KEY = os.getenv("REQUIRE_API_KEY", "false").strip().lower() in {
    "1",
    "true",
    "yes",
    "on",
}

def is_api_key_valid(key: str) -> bool:
    """Return True when *key* is allowed to access the API.

    When REQUIRE_API_KEY is false every caller is accepted, whatever *key*
    is. Otherwise *key* (with surrounding whitespace stripped) must equal
    API_ACCESS_KEY; a missing or empty *key*, or an unconfigured
    API_ACCESS_KEY, is rejected.

    Args:
        key: The key supplied by the client. None and "" are tolerated
            and treated as "no key supplied".

    Returns:
        True if access is granted, False otherwise.
    """
    import hmac  # function-scope import keeps this block self-contained

    if not REQUIRE_API_KEY:
        return True
    if not (API_ACCESS_KEY and key):
        return False
    # Compare in constant time so response timing does not reveal how many
    # leading characters of a guessed key are correct. Encoding to bytes is
    # required because compare_digest() rejects non-ASCII str arguments.
    return hmac.compare_digest(
        key.strip().encode("utf-8"),
        API_ACCESS_KEY.encode("utf-8"),
    )

def is_gemini_available():
    """Tell whether Gemini is configured.

    Returns:
        True when GEMINI_API_KEY holds a non-empty value, False otherwise.
    """
    configured = bool(GEMINI_API_KEY)
    return configured