File size: 2,692 Bytes
15d9931
 
a2967ae
 
 
 
15d9931
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a2967ae
15d9931
a2967ae
15d9931
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import os

from dotenv import load_dotenv  # lets local runs pick up secrets from .env.local

# Load environment overrides for local development before any os.getenv()
# below executes. The .env.local file is expected one directory above this
# module; when it is absent (e.g. on Hugging Face Spaces, where secrets are
# injected directly), load_dotenv is simply a no-op.
_env_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), '.env.local')
load_dotenv(dotenv_path=_env_file)

# --- OpenRouter DeepSeek API Configuration ---
# DeepSeek API key (via OpenRouter), read from the environment.
# Configure it as a secret on Hugging Face Spaces. Stripped of surrounding
# whitespace when present; left as-is (None/empty) when not set.
_raw_key = os.getenv("DEEPSEEK_R1_V3_API_KEY")
DEEPSEEK_API_KEY = _raw_key.strip() if _raw_key else _raw_key

# Chat-completions endpoint on OpenRouter.
DEEPSEEK_API_URL = 'https://openrouter.ai/api/v1/chat/completions'

# Auth headers for OpenRouter requests. Built once at import time, so the
# key must be in the environment before this module is first imported.
DEEPSEEK_HEADERS = {
    'Authorization': f'Bearer {DEEPSEEK_API_KEY}',
    'Content-Type': 'application/json'
}

# --- Embedding Model Configuration ---
# Hugging Face model id used to embed document chunks.
EMBEDDING_MODEL_NAME = 'BAAI/bge-m3'
# Run the embedding model in float16 to halve memory use on capable
# hardware (GPU); flip to False if it misbehaves on CPU.
EMBEDDING_MODEL_USE_FP16 = True

# --- ChromaDB Configuration ---
# On-disk location for ChromaDB's persisted database, relative to the
# application's working directory.
CHROMADB_PERSIST_DIRECTORY = "./chroma_db"
# Collection that holds the embedded document chunks.
CHROMADB_COLLECTION_NAME = "pdf_documents_collection"

# --- Document Chunking Configuration ---
# Upper bound on chunk size used for embedding and retrieval.
CHUNK_SIZE = 700
# Characters shared between adjacent chunks so context survives the split.
CHUNK_OVERLAP = 100

# --- LLM Response Parameters ---
# Sampling temperature for DeepSeek; lower means more deterministic output.
LLM_TEMPERATURE = 0.5
# Cap on tokens generated per response (tuned down for DeepSeek).
LLM_MAX_TOKENS = 4096
# Approximate token budget for truncating conversation history (not an
# exact token count).
# NOTE(review): 9192 is an unusual limit — possibly a typo for 8192; confirm
# against the model's context window before changing.
LLM_HISTORY_MAX_TOKENS = 9192

# --- Tesseract and Poppler Configuration (Docker/Deployment Specific) ---
# Defaults mirror the paths set in the Dockerfile; both can be overridden
# through the environment.
TESSDATA_PREFIX = os.getenv("TESSDATA_PREFIX", "/usr/share/tesseract-ocr/4.00/tessdata")
TESSERACT_CMD = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
# Directory containing Poppler's binaries when it is not on PATH
# (mainly needed for local Windows setups).
POPPLER_PATH = None  # e.g., r'C:\path\to\poppler\bin'


# --- Firebase Configuration (for Conversational Memory) ---
# Base64-encoded JSON of the Firebase service-account key, read from the
# environment; configure it as a secret on Hugging Face Spaces.
FIREBASE_CONFIG_BASE64 = os.getenv("FIREBASE_CONFIG_BASE64")