Hemang Thakur
committed on
Commit
·
ffb491f
1
Parent(s):
88b2fda
updated file reading logic
Browse files
- main.py +49 -12
- requirements.txt +1 -0
main.py
CHANGED
|
@@ -6,6 +6,7 @@ import shutil
|
|
| 6 |
import asyncio
|
| 7 |
import logging
|
| 8 |
import traceback
|
|
|
|
| 9 |
from httpx import AsyncClient, RequestError
|
| 10 |
from typing import List, Dict, Any, Optional
|
| 11 |
from fastapi.staticfiles import StaticFiles
|
|
@@ -131,19 +132,55 @@ async def process_query(user_query: str, sse_queue: asyncio.Queue):
|
|
| 131 |
if filename not in state["user_files_cache"]:
|
| 132 |
try:
|
| 133 |
await sse_queue.put(("step", "Reading User-Provided Files..."))
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
# Try
|
|
|
|
|
|
|
|
|
|
| 140 |
try:
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
|
| 148 |
# Add all cached file contents
|
| 149 |
for filename, content in state["user_files_cache"].items():
|
|
|
|
| 6 |
import asyncio
|
| 7 |
import logging
|
| 8 |
import traceback
|
| 9 |
+
from chardet import detect
|
| 10 |
from httpx import AsyncClient, RequestError
|
| 11 |
from typing import List, Dict, Any, Optional
|
| 12 |
from fastapi.staticfiles import StaticFiles
|
|
|
|
| 132 |
if filename not in state["user_files_cache"]:
|
| 133 |
try:
|
| 134 |
await sse_queue.put(("step", "Reading User-Provided Files..."))
|
| 135 |
+
|
| 136 |
+
# Always read as binary first
|
| 137 |
+
with open(file_path, 'rb') as f:
|
| 138 |
+
file_bytes = f.read()
|
| 139 |
+
|
| 140 |
+
# Try to decode with multiple strategies
|
| 141 |
+
file_content = None
|
| 142 |
+
|
| 143 |
+
# Strategy 1: Try UTF-8 with BOM handling
|
| 144 |
try:
|
| 145 |
+
# Handle UTF-8 BOM if present
|
| 146 |
+
if file_bytes.startswith(b'\xef\xbb\xbf'):
|
| 147 |
+
file_content = file_bytes[3:].decode('utf-8')
|
| 148 |
+
else:
|
| 149 |
+
file_content = file_bytes.decode('utf-8')
|
| 150 |
+
logger.info(f"Successfully decoded {filename} as UTF-8")
|
| 151 |
+
except UnicodeDecodeError:
|
| 152 |
+
# Strategy 2: Try other common encodings
|
| 153 |
+
for encoding in ['utf-8-sig', 'latin-1', 'cp1252', 'iso-8859-1', 'windows-1252']:
|
| 154 |
+
try:
|
| 155 |
+
file_content = file_bytes.decode(encoding)
|
| 156 |
+
logger.info(f"Successfully decoded {filename} with {encoding}")
|
| 157 |
+
break
|
| 158 |
+
except UnicodeDecodeError:
|
| 159 |
+
continue
|
| 160 |
+
|
| 161 |
+
# Strategy 3: If all else fails, use chardet for detection
|
| 162 |
+
if file_content is None:
|
| 163 |
+
try:
|
| 164 |
+
|
| 165 |
+
detected = detect(file_bytes)
|
| 166 |
+
if detected['encoding']:
|
| 167 |
+
file_content = file_bytes.decode(detected['encoding'])
|
| 168 |
+
logger.info(f"Decoded {filename} with detected encoding: {detected['encoding']}")
|
| 169 |
+
except:
|
| 170 |
+
pass
|
| 171 |
+
|
| 172 |
+
# Final fallback: Use UTF-8 with replacement
|
| 173 |
+
if file_content is None:
|
| 174 |
+
file_content = file_bytes.decode('utf-8', errors='replace')
|
| 175 |
+
logger.warning(f"Had to use error replacement for {filename}")
|
| 176 |
+
|
| 177 |
+
# Store the decoded content
|
| 178 |
+
state["user_files_cache"][filename] = file_content
|
| 179 |
+
logger.info(f"Successfully cached file {filename}, length: {len(file_content)} chars")
|
| 180 |
+
|
| 181 |
+
except Exception as e:
|
| 182 |
+
logger.error(f"Error reading file {filename}: {str(e)}")
|
| 183 |
+
state["user_files_cache"][filename] = "" # Cache empty to avoid retrying
|
| 184 |
|
| 185 |
# Add all cached file contents
|
| 186 |
for filename, content in state["user_files_cache"].items():
|
requirements.txt
CHANGED
|
@@ -3,6 +3,7 @@ aiohttp==3.10.10
|
|
| 3 |
anthropic==0.42.0
|
| 4 |
beautifulsoup4==4.12.3
|
| 5 |
bert_score==0.3.13
|
|
|
|
| 6 |
crawl4ai[all]==0.3.731
|
| 7 |
deepeval==2.0
|
| 8 |
fake_useragent==1.5.1
|
|
|
|
| 3 |
anthropic==0.42.0
|
| 4 |
beautifulsoup4==4.12.3
|
| 5 |
bert_score==0.3.13
|
| 6 |
+
chardet>=5.0.0
|
| 7 |
crawl4ai[all]==0.3.731
|
| 8 |
deepeval==2.0
|
| 9 |
fake_useragent==1.5.1
|