sairika committed on
Commit
db73aaa
Β·
verified Β·
1 Parent(s): e93d2d3

Create config.py

Browse files
Files changed (1) hide show
  1. config.py +143 -0
config.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+
4
class Config:
    """Configuration for the Smart RAG API.

    Every setting can be overridden through an environment variable of the
    same name; the values below are the defaults. Call
    ``Config.setup_environment()`` once at startup (this module does so at
    import time) to export cache/threading variables and create the working
    directories before any model libraries are loaded.
    """

    # Base directories, anchored to this file's location
    BASE_DIR = Path(__file__).parent
    UPLOAD_DIR = BASE_DIR / "uploads"
    VECTOR_STORE_DIR = BASE_DIR / "vector_store"
    TEMP_DIR = BASE_DIR / "temp"

    # File processing
    MAX_FILE_SIZE = int(os.getenv("MAX_FILE_SIZE", 10 * 1024 * 1024))  # bytes; 10MB default
    ALLOWED_EXTENSIONS = {
        '.pdf', '.docx', '.txt', '.jpg', '.jpeg', '.png', '.csv', '.db'
    }

    # Text chunking
    CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", 500))
    CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", 50))

    # Hugging Face models (free)
    EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")

    # LLM model (choose based on performance needs). Alternatives:
    #   "microsoft/DialoGPT-medium" - conversational responses
    #   "google/flan-t5-small"      - faster, smaller model
    #   "facebook/bart-large-cnn"   - good for summarization
    LLM_MODEL = os.getenv("LLM_MODEL", "google/flan-t5-base")

    # Vector search
    VECTOR_SEARCH_K = int(os.getenv("VECTOR_SEARCH_K", 5))
    SIMILARITY_THRESHOLD = float(os.getenv("SIMILARITY_THRESHOLD", 0.1))

    # OCR settings
    TESSERACT_CMD = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
    OCR_LANGUAGE = os.getenv("OCR_LANGUAGE", "eng")

    # API settings
    API_HOST = os.getenv("API_HOST", "0.0.0.0")
    API_PORT = int(os.getenv("API_PORT", 7860))

    # Gradio settings (string envs compared case-insensitively to "true")
    GRADIO_SHARE = os.getenv("GRADIO_SHARE", "true").lower() == "true"
    GRADIO_DEBUG = os.getenv("GRADIO_DEBUG", "false").lower() == "true"

    # Model cache directory for Hugging Face downloads. HF_HOME arrives as a
    # str from the environment, so normalize to Path for a consistent type.
    HF_CACHE_DIR = Path(os.getenv("HF_HOME", BASE_DIR / "model_cache"))

    # Performance settings
    TORCH_THREADS = int(os.getenv("TORCH_THREADS", 4))
    USE_GPU = os.getenv("USE_GPU", "false").lower() == "true"

    # Logging
    LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")

    @classmethod
    def setup_environment(cls):
        """Export environment variables and create required directories.

        Idempotent: safe to call more than once. Must run before the
        transformers/torch libraries are imported so the cache and thread
        settings take effect.
        """
        # Create the working directories the app writes into; the class
        # only declares them, so they must be materialized here.
        for directory in (cls.UPLOAD_DIR, cls.VECTOR_STORE_DIR,
                          cls.TEMP_DIR, cls.HF_CACHE_DIR):
            Path(directory).mkdir(parents=True, exist_ok=True)

        # Route Hugging Face model downloads into the local cache
        os.environ["HF_HOME"] = str(cls.HF_CACHE_DIR)
        os.environ["TRANSFORMERS_CACHE"] = str(cls.HF_CACHE_DIR)

        # Cap the CPU thread pools used by PyTorch / MKL backends
        os.environ["OMP_NUM_THREADS"] = str(cls.TORCH_THREADS)
        os.environ["MKL_NUM_THREADS"] = str(cls.TORCH_THREADS)

        # Disable the tokenizers fork-parallelism warning
        os.environ["TOKENIZERS_PARALLELISM"] = "false"

        # Point pytesseract at the configured binary, best-effort: a missing
        # pytesseract package must not break import-time setup.
        if os.path.exists(cls.TESSERACT_CMD):
            try:
                import pytesseract
                pytesseract.pytesseract.tesseract_cmd = cls.TESSERACT_CMD
            except ImportError:
                pass

    # Per-extension display/dispatch metadata; keys mirror ALLOWED_EXTENSIONS
    FILE_TYPE_CONFIG = {
        '.pdf': {
            'icon': '📄',
            'description': 'PDF Document',
            'processor': 'pdf'
        },
        '.docx': {
            'icon': '📝',
            'description': 'Word Document',
            'processor': 'docx'
        },
        '.txt': {
            'icon': '📃',
            'description': 'Text File',
            'processor': 'text'
        },
        '.jpg': {
            'icon': '🖼️',
            'description': 'JPEG Image',
            'processor': 'image'
        },
        '.jpeg': {
            'icon': '🖼️',
            'description': 'JPEG Image',
            'processor': 'image'
        },
        '.png': {
            'icon': '🖼️',
            'description': 'PNG Image',
            'processor': 'image'
        },
        '.csv': {
            'icon': '📊',
            'description': 'CSV Data',
            'processor': 'csv'
        },
        '.db': {
            'icon': '🗄️',
            'description': 'SQLite Database',
            'processor': 'database'
        }
    }

    # Preset model pairings for different speed/accuracy trade-offs
    MODEL_CONFIGS = {
        'fast': {
            'embedding': 'sentence-transformers/all-MiniLM-L6-v2',
            'llm': 'google/flan-t5-small',
            'description': 'Fast processing, lower accuracy'
        },
        'balanced': {
            'embedding': 'sentence-transformers/all-MiniLM-L6-v2',
            'llm': 'google/flan-t5-base',
            'description': 'Balanced speed and accuracy'
        },
        'accurate': {
            'embedding': 'sentence-transformers/all-mpnet-base-v2',
            'llm': 'google/flan-t5-large',
            'description': 'Higher accuracy, slower processing'
        }
    }
141
+
142
# Initialize configuration at import time so cache and threading environment
# variables are exported before any model/tokenizer libraries are imported.
Config.setup_environment()