Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -68,8 +68,9 @@ class EmbeddingModel:
|
|
| 68 |
self.model = HuggingFaceEmbeddings(model_name=model_name)
|
| 69 |
self.max_tokens = max_tokens
|
| 70 |
|
| 71 |
-
def embed(self,
|
| 72 |
-
|
|
|
|
| 73 |
|
| 74 |
def process_files(model_name, split_strategy, chunk_size=500, overlap_size=50, max_tokens=None):
|
| 75 |
# File processing
|
|
@@ -78,41 +79,44 @@ def process_files(model_name, split_strategy, chunk_size=500, overlap_size=50, m
|
|
| 78 |
file_path = os.path.join(FILES_DIR, file)
|
| 79 |
text += FileHandler.extract_text(file_path)
|
| 80 |
|
| 81 |
-
# Split text
|
| 82 |
if split_strategy == 'token':
|
| 83 |
splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap_size)
|
| 84 |
else:
|
| 85 |
splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap_size)
|
| 86 |
|
| 87 |
chunks = splitter.split_text(text)
|
|
|
|
|
|
|
| 88 |
model = EmbeddingModel(MODELS[model_name], max_tokens=max_tokens)
|
| 89 |
-
embeddings = model.embed(
|
| 90 |
|
| 91 |
return embeddings, chunks
|
| 92 |
|
| 93 |
def search_embeddings(query, model_name, top_k):
|
| 94 |
model = HuggingFaceEmbeddings(model_name=MODELS[model_name])
|
| 95 |
embeddings = model.embed_query(query)
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
def calculate_statistics(embeddings, start_time=None):
    """Return simple statistics about an embedding run.

    Args:
        embeddings: Sized collection of embeddings; only its length is used.
        start_time: Optional ``time.time()`` value captured before the work
            started. When given, ``time_taken`` is the elapsed seconds since
            then. When omitted, ``time_taken`` is the current epoch timestamp,
            which is what this function has always returned and is kept only
            for backward compatibility.

    Returns:
        A dict with keys ``"tokens"`` (``len(embeddings)``, i.e. the number of
        items passed in, not a tokenizer count) and ``"time_taken"``.
    """
    now = time.time()
    # A bare timestamp is not a duration; only subtract when the caller
    # supplied the moment the work began.
    time_taken = now if start_time is None else now - start_time
    return {"tokens": len(embeddings), "time_taken": time_taken}
|
| 101 |
|
| 102 |
import shutil
|
|
|
|
| 103 |
def upload_file(file, model_name, split_strategy, chunk_size, overlap_size, max_tokens, query, top_k):
|
| 104 |
# Ensure default values are set if None is passed
|
| 105 |
-
if chunk_size
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
chunk_size = int(chunk_size) # Convert chunk_size to int
|
| 113 |
-
overlap_size = int(overlap_size) # Convert overlap_size to int
|
| 114 |
-
except ValueError:
|
| 115 |
-
return {"error": "Chunk size and overlap size must be valid integers."}
|
| 116 |
|
| 117 |
# Process files and get embeddings
|
| 118 |
embeddings, chunks = process_files(model_name, split_strategy, chunk_size, overlap_size, max_tokens)
|
|
|
|
| 68 |
self.model = HuggingFaceEmbeddings(model_name=model_name)
|
| 69 |
self.max_tokens = max_tokens
|
| 70 |
|
| 71 |
+
def embed(self, chunks: List[str]):
    """Embed every chunk with the wrapped embedding model.

    Args:
        chunks: Text chunks, handed unchanged to
            ``self.model.embed_documents``.

    Returns:
        Whatever ``self.model.embed_documents`` returns for ``chunks``.
    """
    vectors = self.model.embed_documents(chunks)
    return vectors
|
| 74 |
|
| 75 |
def process_files(model_name, split_strategy, chunk_size=500, overlap_size=50, max_tokens=None):
|
| 76 |
# File processing
|
|
|
|
| 79 |
file_path = os.path.join(FILES_DIR, file)
|
| 80 |
text += FileHandler.extract_text(file_path)
|
| 81 |
|
| 82 |
+
# Split text into chunks
|
| 83 |
if split_strategy == 'token':
|
| 84 |
splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap_size)
|
| 85 |
else:
|
| 86 |
splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap_size)
|
| 87 |
|
| 88 |
chunks = splitter.split_text(text)
|
| 89 |
+
|
| 90 |
+
# Embed chunks, not the full text
|
| 91 |
model = EmbeddingModel(MODELS[model_name], max_tokens=max_tokens)
|
| 92 |
+
embeddings = model.embed(chunks)
|
| 93 |
|
| 94 |
return embeddings, chunks
|
| 95 |
|
| 96 |
def search_embeddings(query, model_name, top_k):
    """Embed ``query`` with the model registered under ``model_name``.

    No similarity search happens here yet: the query embedding itself is
    returned and ``top_k`` is currently unused.
    """
    embedder = HuggingFaceEmbeddings(model_name=MODELS[model_name])
    query_embedding = embedder.embed_query(query)

    # TODO: build a FAISS (or other similarity-based) index over the document
    # embeddings, search it with query_embedding, and return the top-k results.
    return query_embedding
|
| 104 |
|
| 105 |
def calculate_statistics(embeddings, start_time=None):
    """Return simple statistics about an embedding run.

    Args:
        embeddings: Sized collection of embeddings; only its length is used.
        start_time: Optional ``time.time()`` value captured before the work
            started. When given, ``time_taken`` is the elapsed seconds since
            then. When omitted, ``time_taken`` is the current epoch timestamp,
            which is what this function has always returned and is kept only
            for backward compatibility.

    Returns:
        A dict with keys ``"tokens"`` (``len(embeddings)``, i.e. the number of
        items passed in, not a tokenizer count) and ``"time_taken"``.
    """
    now = time.time()
    # A bare timestamp is not a duration; only subtract when the caller
    # supplied the moment the work began.
    time_taken = now if start_time is None else now - start_time
    return {"tokens": len(embeddings), "time_taken": time_taken}
|
| 108 |
|
| 109 |
import shutil
|
| 110 |
+
|
| 111 |
def upload_file(file, model_name, split_strategy, chunk_size, overlap_size, max_tokens, query, top_k):
|
| 112 |
# Ensure default values are set if None is passed
|
| 113 |
+
chunk_size = int(chunk_size) if chunk_size else 100
|
| 114 |
+
overlap_size = int(overlap_size) if overlap_size else 0
|
| 115 |
+
|
| 116 |
+
# Save uploaded file
|
| 117 |
+
file_path = os.path.join(FILES_DIR, file.name)
|
| 118 |
+
with open(file_path, "wb") as f:
|
| 119 |
+
shutil.copyfileobj(file, f) # Copy the uploaded file content to the destination
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
|
| 121 |
# Process files and get embeddings
|
| 122 |
embeddings, chunks = process_files(model_name, split_strategy, chunk_size, overlap_size, max_tokens)
|