Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -107,6 +107,15 @@ class ModelManager:
|
|
| 107 |
|
| 108 |
model_manager = ModelManager()
|
| 109 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
# File Handling
|
| 111 |
class FileHandler:
|
| 112 |
@staticmethod
|
|
@@ -118,6 +127,14 @@ class FileHandler:
|
|
| 118 |
return FileHandler._extract_from_docx(file_path)
|
| 119 |
elif ext == '.txt':
|
| 120 |
return FileHandler._extract_from_txt(file_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
else:
|
| 122 |
raise ValueError(f"Unsupported file type: {ext}")
|
| 123 |
|
|
@@ -136,6 +153,46 @@ class FileHandler:
|
|
| 136 |
with open(file_path, 'r', encoding='utf-8') as f:
|
| 137 |
return f.read()
|
| 138 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
# Text Processing
|
| 140 |
def simple_tokenize(text):
|
| 141 |
return text.split()
|
|
@@ -210,7 +267,9 @@ def optimize_query(
|
|
| 210 |
#print(llm.invoke('Hello'))
|
| 211 |
# Limit max time or set a timeout for LLM to avoid endless execution
|
| 212 |
optimized_queries = multi_query_retriever.invoke(query, max_time=30) # Timeout in seconds
|
| 213 |
-
|
|
|
|
|
|
|
| 214 |
return optimized_queries
|
| 215 |
|
| 216 |
|
|
@@ -1436,8 +1495,8 @@ def run_automated_tests_and_analyze(*args):
|
|
| 1436 |
'model_type': model_types,
|
| 1437 |
'model_name': [name.strip() for name in model_names.split(',')],
|
| 1438 |
'split_strategy': split_strategies,
|
| 1439 |
-
'chunk_size': [int(size.strip()) for size in chunk_sizes.split(',')],
|
| 1440 |
-
'overlap_size': [int(size.strip()) for size in overlap_sizes.split(',')],
|
| 1441 |
'vector_store_type': vector_store_types,
|
| 1442 |
'search_type': search_types,
|
| 1443 |
'top_k': [int(k.strip()) for k in top_k_values.split(',')],
|
|
|
|
| 107 |
|
| 108 |
model_manager = ModelManager()
|
| 109 |
|
| 110 |
+
# File Handling
|
| 111 |
+
import os
|
| 112 |
+
import json
|
| 113 |
+
import csv
|
| 114 |
+
import xml.etree.ElementTree as ET
|
| 115 |
+
import openpyxl # for handling .xlsx files
|
| 116 |
+
import pdfplumber
|
| 117 |
+
import docx
|
| 118 |
+
|
| 119 |
# File Handling
|
| 120 |
class FileHandler:
|
| 121 |
@staticmethod
|
|
|
|
| 127 |
return FileHandler._extract_from_docx(file_path)
|
| 128 |
elif ext == '.txt':
|
| 129 |
return FileHandler._extract_from_txt(file_path)
|
| 130 |
+
elif ext == '.xml':
|
| 131 |
+
return FileHandler._extract_from_xml(file_path)
|
| 132 |
+
elif ext == '.json':
|
| 133 |
+
return FileHandler._extract_from_json(file_path)
|
| 134 |
+
elif ext == '.xlsx':
|
| 135 |
+
return FileHandler._extract_from_xlsx(file_path)
|
| 136 |
+
elif ext == '.csv':
|
| 137 |
+
return FileHandler._extract_from_csv(file_path)
|
| 138 |
else:
|
| 139 |
raise ValueError(f"Unsupported file type: {ext}")
|
| 140 |
|
|
|
|
| 153 |
with open(file_path, 'r', encoding='utf-8') as f:
|
| 154 |
return f.read()
|
| 155 |
|
| 156 |
+
@staticmethod
def _extract_from_xml(file_path):
    """Parse the XML document at *file_path* and return its text content.

    Delegates the actual text collection to FileHandler._extract_xml_text,
    which walks the element tree recursively.
    """
    document = ET.parse(file_path)
    return FileHandler._extract_xml_text(document.getroot())
|
| 161 |
+
|
| 162 |
+
@staticmethod
|
| 163 |
+
def _extract_xml_text(element):
|
| 164 |
+
# Recursively extract text from XML elements
|
| 165 |
+
text = element.text or ""
|
| 166 |
+
for child in element:
|
| 167 |
+
text += FileHandler._extract_xml_text(child)
|
| 168 |
+
return text.strip()
|
| 169 |
+
|
| 170 |
+
@staticmethod
|
| 171 |
+
def _extract_from_json(file_path):
|
| 172 |
+
with open(file_path, 'r', encoding='utf-8') as f:
|
| 173 |
+
data = json.load(f)
|
| 174 |
+
return json.dumps(data, indent=4) # Pretty print JSON for readability
|
| 175 |
+
|
| 176 |
+
@staticmethod
def _extract_from_xlsx(file_path):
    """Render an .xlsx spreadsheet as plain text.

    Each row becomes one tab-separated line; empty (None) cells are
    skipped rather than rendered as the string "None".
    NOTE(review): only the workbook's active sheet is read — other
    sheets are ignored; confirm that is intended.
    """
    sheet = openpyxl.load_workbook(file_path).active
    lines = [
        '\t'.join(str(cell) for cell in row if cell is not None)
        for row in sheet.iter_rows(values_only=True)
    ]
    return '\n'.join(lines)
|
| 184 |
+
|
| 185 |
+
@staticmethod
|
| 186 |
+
def _extract_from_csv(file_path):
|
| 187 |
+
with open(file_path, 'r', encoding='utf-8') as f:
|
| 188 |
+
reader = csv.reader(f)
|
| 189 |
+
data = []
|
| 190 |
+
for row in reader:
|
| 191 |
+
data.append(','.join(row))
|
| 192 |
+
return '\n'.join(data)
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
|
| 196 |
# Text Processing
|
| 197 |
def simple_tokenize(text):
    """Split *text* into tokens on runs of whitespace.

    Uses str.split() with no separator, so leading/trailing whitespace
    is ignored and an empty string yields an empty list.
    """
    return text.split()
|
|
|
|
| 267 |
#print(llm.invoke('Hello'))
|
| 268 |
# Limit max time or set a timeout for LLM to avoid endless execution
|
| 269 |
optimized_queries = multi_query_retriever.invoke(query, max_time=30) # Timeout in seconds
|
| 270 |
+
print(optimized_queries)
|
| 271 |
+
print('---- optimize query 5 ----')
|
| 272 |
+
|
| 273 |
return optimized_queries
|
| 274 |
|
| 275 |
|
|
|
|
| 1495 |
'model_type': model_types,
|
| 1496 |
'model_name': [name.strip() for name in model_names.split(',')],
|
| 1497 |
'split_strategy': split_strategies,
|
| 1498 |
+
'chunk_size': [int(size.strip()) for size in chunk_sizes.split(',') if size.strip()],
|
| 1499 |
+
'overlap_size': [int(size.strip()) for size in overlap_sizes.split(',') if size.strip()],
|
| 1500 |
'vector_store_type': vector_store_types,
|
| 1501 |
'search_type': search_types,
|
| 1502 |
'top_k': [int(k.strip()) for k in top_k_values.split(',')],
|