Commit · 5b2f824
1 Parent(s): 48d391e

Removed the langchain and llama-cpp-python dependencies (no longer actively supported). Updated packages. Updated the default dataset.

Files changed:
- Dockerfile +0 -1
- app.py +14 -13
- faiss_embedding/faiss_embedding.zip +2 -2
- requirements.txt +11 -13
- requirements_aws.txt +9 -11
- requirements_gpu.txt +12 -14
- tools/chatfuncs.py +68 -61
- tools/config.py +2 -2
- tools/document.py +16 -0
- tools/embeddings.py +24 -0
- tools/faiss_store.py +201 -0
- tools/ingest.py +11 -11
- tools/text_splitter.py +112 -0
Dockerfile
CHANGED

@@ -27,7 +27,6 @@ COPY requirements_aws.txt .
 RUN pip install torch==2.5.1+cpu --target=/install --index-url https://download.pytorch.org/whl/cpu \
     && pip install --no-cache-dir --target=/install sentence-transformers==4.1.0 --no-deps \
     && pip install --no-cache-dir --target=/install span-marker==1.7.0 --no-deps \
-    && pip install --no-cache-dir --target=/install langchain-huggingface==0.1.2 --no-deps \
     && pip install --no-cache-dir --target=/install keybert==0.9.0 --no-deps \
     && pip install --no-cache-dir --target=/install -r requirements_aws.txt
app.py
CHANGED

@@ -1,11 +1,10 @@
 import os
 from typing import Type
-
-from langchain_community.vectorstores import FAISS
+from tools.faiss_store import FAISS
 import gradio as gr
 import pandas as pd
 from torch import float16, float32
 
 from huggingface_hub import hf_hub_download
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM

@@ -13,7 +12,7 @@ from tools.ingest import embed_faiss_save_to_zip, load_embeddings_model, get_fai
 from tools.helper_functions import get_connection_params, reveal_feedback_buttons, wipe_logs
 from tools.aws_functions import upload_file_to_s3
 from tools.auth import authenticate_user
-from tools.config import FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, HOST_NAME, COGNITO_AUTH, INPUT_FOLDER, OUTPUT_FOLDER, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, DEFAULT_EMBEDDINGS_LOCATION, EMBEDDINGS_MODEL_NAME, DEFAULT_DATA_SOURCE, HF_TOKEN, LARGE_MODEL_REPO_ID, LARGE_MODEL_GGUF_FILE, LARGE_MODEL_NAME, SMALL_MODEL_NAME, SMALL_MODEL_REPO_ID, DEFAULT_DATA_SOURCE_NAME, DEFAULT_EXAMPLES, DEFAULT_MODEL_CHOICES, RUN_GEMINI_MODELS, LOAD_LARGE_MODEL
+from tools.config import FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, HOST_NAME, COGNITO_AUTH, INPUT_FOLDER, OUTPUT_FOLDER, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, DEFAULT_EMBEDDINGS_LOCATION, EMBEDDINGS_MODEL_NAME, DEFAULT_DATA_SOURCE, HF_TOKEN, LARGE_MODEL_REPO_ID, LARGE_MODEL_GGUF_FILE, LARGE_MODEL_NAME, SMALL_MODEL_NAME, SMALL_MODEL_REPO_ID, DEFAULT_DATA_SOURCE_NAME, DEFAULT_EXAMPLES, DEFAULT_MODEL_CHOICES, RUN_GEMINI_MODELS, LOAD_LARGE_MODEL, GEMINI_API_KEY
 from tools.model_load import torch_device, gpu_config, cpu_config, context_length
 import tools.chatfuncs as chatf
 import tools.ingest as ing

@@ -39,10 +38,11 @@ if isinstance(DEFAULT_MODEL_CHOICES, str): default_model_choices = eval(DEFAULT_
 ###
 # Load in default embeddings and embeddings model name
 embeddings_model = load_embeddings_model(EMBEDDINGS_MODEL_NAME)
-vectorstore = get_faiss_store(zip_file_path=DEFAULT_EMBEDDINGS_LOCATION,embeddings_model=embeddings_model)#globals()["embeddings"])
+# vectorstore = get_faiss_store(zip_file_path=DEFAULT_EMBEDDINGS_LOCATION,embeddings_model=embeddings_model)#globals()["embeddings"])
+vectorstore = None
 
 chatf.embeddings = embeddings_model
-chatf.vectorstore = vectorstore
+# chatf.vectorstore = vectorstore
 
 def docs_to_faiss_save(docs_out:PandasDataFrame, embeddings_model=embeddings_model):

@@ -64,17 +64,17 @@ def create_hf_model(model_name:str, hf_token=HF_TOKEN):
             model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")#, torch_dtype=torch.float16)
         else:
             if hf_token:
-                model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", token=hf_token, torch_dtype=float16)
+                model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", token=hf_token) # , torch_dtype=float16 - not compatible with CPU and Gemma 3
             else:
-                model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=float16)
+                model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto") # , torch_dtype=float16
     else:
         if "flan" in model_name:
             model = AutoModelForSeq2SeqLM.from_pretrained(model_name)#, torch_dtype=torch.float16)
         else:
             if hf_token:
-                model = AutoModelForCausalLM.from_pretrained(model_name, token=hf_token, torch_dtype=float16)
+                model = AutoModelForCausalLM.from_pretrained(model_name, token=hf_token) # , torch_dtype=float16
             else:
-                model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=float16)
+                model = AutoModelForCausalLM.from_pretrained(model_name) # , torch_dtype=float16
 
     if hf_token:
         tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length = context_length, token=hf_token)

@@ -97,6 +97,7 @@ def load_model(model_type:str, gpu_layers:int, gpu_config:dict=gpu_config, cpu_c
         print("Loading with", cpu_config.n_gpu_layers, "model layers sent to GPU.")
 
         try:
+            from llama_cpp import Llama
             model = Llama(
                 model_path=hf_hub_download(
                     repo_id=LARGE_MODEL_REPO_ID,

@@ -248,9 +249,9 @@ with app:
             with gr.Column(scale=3):
                 model_choice = gr.Radio(label="Choose a chat model", value=SMALL_MODEL_NAME, choices = default_model_choices)
                 if RUN_GEMINI_MODELS == "1":
-                    in_api_key = gr.Textbox(value =
+                    in_api_key = gr.Textbox(value = GEMINI_API_KEY, label="Enter Gemini API key (only if using Google API models)", lines=1, type="password",interactive=True, visible=True)
                 else:
-                    in_api_key = gr.Textbox(value =
+                    in_api_key = gr.Textbox(value = GEMINI_API_KEY, label="Enter Gemini API key (only if using Google API models)", lines=1, type="password",interactive=True, visible=False)
             with gr.Column(scale=1):
                 change_model_button = gr.Button(value="Load model")

@@ -264,7 +265,7 @@
         load_text = gr.Text(label="Load status")
 
         gr.HTML(
-            "<center>This app is powered by Gradio
+            "<center>This app is powered by Gradio and Transformers.</center>"
         )
 
     examples_set.change(fn=chatf.update_message, inputs=[examples_set], outputs=[message])
faiss_embedding/faiss_embedding.zip
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:5e9d58ea966d7fb5bf05c6d13217ab0a4f15c18607976b3ed443b6bd163b390e
+size 293425
requirements.txt
CHANGED

@@ -1,26 +1,24 @@
-langchain==0.3.24
-langchain-huggingface==0.1.2
-langchain-community==0.3.22
 beautifulsoup4==4.13.4
-google-
+google-genai==1.50.0
 pandas==2.2.3
-
+markdown==3.8.1
+transformers==4.57.1
 # For Windows https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.2/llama_cpp_python-0.3.2-cp311-#cp311-win_amd64.whl -C cmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS"
-llama-cpp-python==0.3.2 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu # For linux if dependencies for below build command are not available in the environment
+# llama-cpp-python==0.3.2 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu # For linux if dependencies for below build command are not available in the environment
 #llama-cpp-python==0.3.9 -C cmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS" # CPU
 #llama-cpp-python==0.3.9 -C cmake.args="-DGGML_CUDA=on" # With CUDA
-torch
-sentence_transformers==
+torch>=2.6.0 --extra-index-url https://download.pytorch.org/whl/cpu
+sentence_transformers==5.1.2
 faiss-cpu==1.10.0
 pypdf==5.4.0
 python-docx==1.1.2
 keybert==0.9.0
 span-marker==1.7.0
-gradio==5.
+gradio==5.49.1
 nltk==3.9.1
-bm25s==0.2.
+bm25s==0.2.14
 PyStemmer==2.2.0.3
-scipy==1.
-numpy==
-boto3==1.
+scipy==1.16.3
+numpy==2.2.6
+boto3==1.40.72
 python-dotenv==1.1.0
requirements_aws.txt
CHANGED

@@ -1,15 +1,13 @@
-#langchain==0.3.24
-#langchain-huggingface==0.1.2 # Loaded in Dockerfile
 boto3==1.38.0
 python-dotenv==1.1.0
-langchain-community==0.3.22
 beautifulsoup4==4.13.4
-google-
+google-genai==1.50.0
 pandas==2.2.3
-
+markdown==3.8.1
+transformers==4.57.1
 # For Windows https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.2/llama_cpp_python-0.3.2-cp311-#cp311-win_amd64.whl -C cmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS"
 #llama-cpp-python==0.3.2 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu # For linux if dependencies for below build command are not available in the environment
-llama-cpp-python==0.3.16 -C cmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS"
+# llama-cpp-python==0.3.16 -C cmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS"
 #torch==2.5.1 --extra-index-url https://download.pytorch.org/whl/cpu # Loaded in Dockerfile
 #sentence_transformers==4.1.0 # Loaded in Dockerfile
 faiss-cpu==1.10.0

@@ -17,11 +15,11 @@ pypdf==5.4.0
 python-docx==1.1.2
 #keybert==0.9.0 # Loaded in Dockerfile
 #span-marker==1.7.0 # Loaded in Dockerfile
-gradio==5.
+gradio==5.49.1
 nltk==3.9.1
-bm25s==0.2.
+bm25s==0.2.14
 PyStemmer==2.2.0.3
-scikit-learn==1.
-scipy==1.
-numpy==
+scikit-learn==1.7.2
+scipy==1.16.3
+numpy==2.2.6
 
requirements_gpu.txt
CHANGED

@@ -1,24 +1,22 @@
-#langchain==0.3.24
-langchain-community==0.3.22
-langchain-huggingface==0.1.2
 beautifulsoup4==4.13.4
-google-
+google-genai==1.50.0
 pandas==2.2.3
-
-
-
-llama-cpp-python==0.3.
-
+markdown==3.8.1
+transformers==4.57.1
+torch>=2.6.0 --extra-index-url https://download.pytorch.org/whl/cu126
+#llama-cpp-python==0.3.2 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124
+# llama-cpp-python==0.3.16 -C cmake.args="-DGGML_CUDA=on"
+sentence_transformers==5.1.2
 faiss-cpu==1.10.0
 pypdf==5.4.0
 python-docx==1.1.2
 keybert==0.9.0
 span-marker==1.7.0
-gradio==5.
+gradio==5.49.1
 nltk==3.9.1
-bm25s==0.2.
+bm25s==0.2.14
 PyStemmer==2.2.0.3
-scipy==1.
-numpy==
-boto3==1.
+scipy==1.16.3
+numpy==2.2.6
+boto3==1.40.72
 python-dotenv==1.1.0
tools/chatfuncs.py
CHANGED

@@ -1,11 +1,12 @@
 import re
 import os
 import datetime
-from typing import Type, Dict, List, Tuple
+from typing import Type, Dict, List, Tuple, Union
 import time
 from itertools import compress
 import pandas as pd
-
+from google import genai as ai
+from google.genai import types
 import gradio as gr
 from gradio import Progress
 import boto3

@@ -14,7 +15,10 @@ from nltk.corpus import stopwords
 from nltk.tokenize import RegexpTokenizer
 from nltk.stem import WordNetLemmatizer
 from keybert import KeyBERT
-from
+from tools.embeddings import HuggingFaceEmbeddings
+from tools.faiss_store import FAISS
+from tools.text_splitter import RecursiveCharacterTextSplitter
+from tools.document import Document
 
 # For Name Entity Recognition model
 #from span_marker import SpanMarkerModel # Not currently used

@@ -26,12 +30,6 @@ import Stemmer
 import torch.cuda
 from threading import Thread
 from transformers import pipeline, TextIteratorStreamer
-# Langchain functions
-from langchain.prompts import PromptTemplate
-from langchain_community.vectorstores import FAISS
-from langchain_community.retrievers import SVMRetriever
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.docstore.document import Document
 
 from tools.prompts import instruction_prompt_template_alpaca, instruction_prompt_mistral_orca, instruction_prompt_phi3, instruction_prompt_llama3, instruction_prompt_qwen, instruction_prompt_template_orca, instruction_prompt_gemma, instruction_prompt_template_gemini_aws
 from tools.model_load import temperature, max_new_tokens, sample, repetition_penalty, top_p, top_k, torch_device, CtransGenGenerationConfig, max_tokens

@@ -79,26 +77,19 @@ kw_model = pipeline("feature-extraction", model="sentence-transformers/all-MiniL
 
 def base_prompt_templates(model_type:str = SMALL_MODEL_NAME):
 
-    #
-    # input_variables=["page_content", "source"],
-    #)
-
-    CONTENT_PROMPT = PromptTemplate(
-        template="{page_content}\n\n",#\n\nSOURCE: {source}\n\n",
-        input_variables=["page_content"]
-    )
+    # Simple string template for content
+    CONTENT_PROMPT_TEMPLATE = "{page_content}\n\n"
 
     # The main prompt:
 
     if model_type == SMALL_MODEL_NAME:
-
+        INSTRUCTION_PROMPT_TEMPLATE = instruction_prompt_gemma
     elif model_type == LARGE_MODEL_NAME:
-
+        INSTRUCTION_PROMPT_TEMPLATE = instruction_prompt_phi3
     else:
-
+        INSTRUCTION_PROMPT_TEMPLATE = instruction_prompt_template_gemini_aws
 
-    return
+    return INSTRUCTION_PROMPT_TEMPLATE, CONTENT_PROMPT_TEMPLATE
 
 def write_out_metadata_as_string(metadata_in:str):
     metadata_string = [f"{' '.join(f'{k}: {v}' for k, v in d.items() if k != 'page_section')}" for d in metadata_in] # ['metadata']

@@ -175,7 +166,7 @@ def generate_expanded_prompt(
 
     sources_docs_content_string = '<br><br>'.join(doc_df['content_meta'])#.replace("  "," ")#.strip()
 
-    instruction_prompt_out = instruction_prompt.
+    instruction_prompt_out = instruction_prompt.replace('{question}', new_question_kworded).replace('{summaries}', docs_content_string)
 
     return instruction_prompt_out, sources_docs_content_string, new_question_kworded

@@ -269,9 +260,9 @@ def call_aws_claude(prompt: str, system_prompt: str, temperature: float, max_tok
 
     return response
 
-def construct_gemini_generative_model(in_api_key: str, temperature: float, model_choice: str, system_prompt: str, max_tokens: int) -> Tuple[object, dict]:
+def construct_gemini_generative_model(in_api_key: str, temperature: float, model_choice: str, system_prompt: str, max_tokens: int, random_seed: int = None) -> Tuple[object, dict]:
     """
-    Constructs a
+    Constructs a Client for Gemini API calls using the new google.genai package.
 
     Parameters:
     - in_api_key (str): The API key for authentication.

@@ -279,34 +270,37 @@ def construct_gemini_generative_model(in_api_key: str, temperature: float, model
     - model_choice (str): The choice of model to use for generation.
     - system_prompt (str): The system prompt to guide the generation.
     - max_tokens (int): The maximum number of tokens to generate.
+    - random_seed (int, optional): Random seed for reproducibility.
 
     Returns:
-    - Tuple[object, dict]: A tuple containing the constructed
+    - Tuple[object, dict]: A tuple containing the constructed Client and its configuration.
     """
-    # Construct a
+    # Construct a Client for the new API
    try:
        if in_api_key:
            #print("Getting API key from textbox")
            api_key = in_api_key
-           ai.
+           client = ai.Client(api_key=api_key)
        elif "GOOGLE_API_KEY" in os.environ:
            #print("Searching for API key in environmental variables")
            api_key = os.environ["GOOGLE_API_KEY"]
-           ai.
+           client = ai.Client(api_key=api_key)
        else:
-           print("No API key
+           print("No API key found")
            raise gr.Error("No API key found.")
    except Exception as e:
        print(e)
+       raise
 
-   config
+   # Create config with optional random_seed
+   config_kwargs = {"temperature": temperature, "max_output_tokens": max_tokens}
+   if random_seed is not None:
+       config_kwargs["seed"] = random_seed
+   config = types.GenerateContentConfig(**config_kwargs)
 
    print("model_choice:", model_choice)
-
-   #model = ai.GenerativeModel.from_cached_content(cached_content=cache, generation_config=config)
-   model = ai.GenerativeModel(model_name=model_choice, system_instruction=system_prompt, generation_config=config)
 
-   return
+   return client, config
 
 # Function to send a request and update history
 def send_request(prompt: str, conversation_history: List[dict], model: object, config: dict, model_choice: str, system_prompt: str, temperature: float, progress=Progress(track_tqdm=True)) -> Tuple[str, List[dict]]:

@@ -333,7 +327,15 @@ def send_request(prompt: str, conversation_history: List[dict], model: object, c
     # Generate the model's response
     if "gemini" in model_choice:
         try:
-
+            # New API: client.models.generate_content instead of model.generate_content
+            gemini_response = model.models.generate_content(model=model_choice, contents=full_prompt, config=config)
+            # Wrap response in ResponseObject for backwards compatibility
+            usage_metadata = {}
+            if hasattr(gemini_response, 'usage_metadata'):
+                usage_metadata = gemini_response.usage_metadata
+            elif hasattr(gemini_response, 'usage'):
+                usage_metadata = gemini_response.usage
+            response = ResponseObject(text=gemini_response.text, usage_metadata=usage_metadata)
         except Exception as e:
             # If fails, try again after 10 seconds in case there is a throttle limit
             print(e)

@@ -343,7 +345,14 @@ def send_request(prompt: str, conversation_history: List[dict], model: object, c
             print(out_message)
             progress(0.5, desc=out_message)
             time.sleep(30)
-
+            gemini_response = model.models.generate_content(model=model_choice, contents=full_prompt, config=config)
+            # Wrap response in ResponseObject for backwards compatibility
+            usage_metadata = {}
+            if hasattr(gemini_response, 'usage_metadata'):
+                usage_metadata = gemini_response.usage_metadata
+            elif hasattr(gemini_response, 'usage'):
+                usage_metadata = gemini_response.usage
+            response = ResponseObject(text=gemini_response.text, usage_metadata=usage_metadata)
         except Exception as e:
             print(e)
             return "", conversation_history

@@ -559,7 +568,7 @@ def produce_streaming_answer_chatbot(
         history.append({"metadata":None, "options":None, "role": "assistant", "content": ''})
 
         for char in clean_response_text:
-            time.sleep(0.
+            time.sleep(0.001)
             history[-1]['content'] += char
             yield history

@@ -594,7 +603,7 @@ def produce_streaming_answer_chatbot(
         history.append({"metadata":None, "options":None, "role": "assistant", "content": ''})
 
         for char in clean_response_text:
-            time.sleep(0.
+            time.sleep(0.001)
             history[-1]['content'] += char
             yield history

@@ -795,31 +804,29 @@ def hybrid_retrieval(
 
 
     # 3rd level check on retrieved docs with SVM retriever
-    #
-    #hf_embeddings = HuggingFaceEmbeddings(**embeddings)
-    hf_embeddings = embeddings_model
-
-    svm_retriever = SVMRetriever.from_texts(content_keep, hf_embeddings, k = k_val)
-    svm_result = svm_retriever.invoke(new_question_kworded)
-
-
-    svm_rank=[]
+    # Note: SVM retriever removed - using vector similarity only
+    # If svm_weight > 0, we'll use a simple ranking based on vector similarity
+    svm_rank = []
     svm_score = []
-
+
+    if svm_weight > 0:
+        # Use vector similarity ranking as a proxy for SVM ranking
+        # This maintains the same interface but uses vector scores
+        for i, vec_item in enumerate(docs_keep):
+            # Use inverse rank (lower rank = higher score)
+            rank = i + 1
+            svm_rank.append(rank)
+            svm_score.append((docs_keep_length/rank)*svm_weight)
+    else:
+        # If svm_weight is 0, set all scores to 0
+        svm_rank = [0] * docs_keep_length
+        svm_score = [0.0] * docs_keep_length
 
 
-    ## Calculate final score based on
-
+    ## Calculate final score based on ranking methods (vector, BM25, and optionally SVM)
+    # Ensure all lists have the same length
+    min_len = min(len(vec_score), len(bm25_score), len(svm_score))
+    final_score = [a + b + c for a, b, c in zip(vec_score[:min_len], bm25_score[:min_len], svm_score[:min_len])]
     final_rank = [sorted(final_score, reverse=True).index(x)+1 for x in final_score]
     # Force final_rank to increment by 1 each time
     final_rank = list(pd.Series(final_rank).rank(method='first'))
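For reference, a minimal sketch of the new google.genai call pattern this file migrates to (not part of the diff; the API key and model name are illustrative placeholders, and only the config fields used above are shown):

    from google import genai
    from google.genai import types

    # Client replaces the old ai.GenerativeModel object
    client = genai.Client(api_key="YOUR_API_KEY")  # hypothetical placeholder key
    config = types.GenerateContentConfig(temperature=0.1, max_output_tokens=1024, seed=42)

    # The client, not a per-model instance, issues the generation call
    response = client.models.generate_content(
        model="gemini-2.0-flash",  # assumed model name for illustration
        contents="Summarise the retrieved passages.",
        config=config,
    )
    print(response.text)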
tools/config.py
CHANGED

@@ -189,9 +189,9 @@ LOAD_LARGE_MODEL = get_or_create_env_var("LOAD_LARGE_MODEL", '0')
 
 LARGE_MODEL_NAME = get_or_create_env_var("LARGE_MODEL_NAME", "Phi 3.5 Mini (larger, slow)")
 
-LARGE_MODEL_REPO_ID = get_or_create_env_var("LARGE_MODEL_REPO_ID", "QuantFactory/Phi-3.5-mini-instruct-GGUF") #
+LARGE_MODEL_REPO_ID = get_or_create_env_var("LARGE_MODEL_REPO_ID", "QuantFactory/Phi-3.5-mini-instruct-GGUF") # THIS METHOD IS DEPRECATED AND WILL NO LONGER BE USED IN FUTURE (Llama-cpp-python is no longer being updated)
 
 LARGE_MODEL_GGUF_FILE = get_or_create_env_var("LARGE_MODEL_GGUF_FILE", "Phi-3.5-mini-instruct.Q4_K_M.gguf")
 
 # Build up options for models
 default_model_choices = [SMALL_MODEL_NAME]
tools/document.py
ADDED

@@ -0,0 +1,16 @@
+"""
+Custom Document class to replace langchain Document.
+"""
+from typing import Dict, Any, Optional
+
+
+class Document:
+    """A simple document class with page_content and metadata."""
+
+    def __init__(self, page_content: str, metadata: Optional[Dict[str, Any]] = None):
+        self.page_content = page_content
+        self.metadata = metadata if metadata is not None else {}
+
+    def __repr__(self):
+        return f"Document(page_content='{self.page_content[:50]}...', metadata={self.metadata})"
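A quick usage sketch of the new class (not part of the diff; values are illustrative):

    from tools.document import Document

    doc = Document(page_content="FAISS is a vector similarity library.", metadata={"source": "notes.txt"})
    print(doc.page_content)  # the raw text
    print(doc.metadata)      # {'source': 'notes.txt'}
    print(doc)               # repr truncates page_content to 50 characters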
tools/embeddings.py
ADDED

@@ -0,0 +1,24 @@
+"""
+Custom embeddings wrapper using sentence-transformers to replace langchain HuggingFaceEmbeddings.
+"""
+from typing import List, Union
+from sentence_transformers import SentenceTransformer
+
+
+class HuggingFaceEmbeddings:
+    """Wrapper around SentenceTransformer to match langchain interface."""
+
+    def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2", **kwargs):
+        self.model_name = model_name
+        self.model = SentenceTransformer(model_name, **kwargs)
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        """Embed a list of documents."""
+        embeddings = self.model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
+        return embeddings.tolist()
+
+    def embed_query(self, text: str) -> List[float]:
+        """Embed a single query."""
+        embedding = self.model.encode([text], convert_to_numpy=True, show_progress_bar=False)
+        return embedding[0].tolist()
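A minimal usage sketch (not part of the diff; assumes the default all-MiniLM-L6-v2 model, which produces 384-dimensional vectors):

    from tools.embeddings import HuggingFaceEmbeddings

    emb = HuggingFaceEmbeddings()  # defaults to sentence-transformers/all-MiniLM-L6-v2
    vectors = emb.embed_documents(["first passage", "second passage"])
    query_vec = emb.embed_query("a question")
    print(len(vectors), len(query_vec))  # 2 document vectors; 384 dimensions each for this model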
tools/faiss_store.py
ADDED

@@ -0,0 +1,201 @@
+"""
+Custom FAISS vectorstore to replace langchain FAISS.
+"""
+import os
+import pickle
+import tempfile
+import zipfile
+from pathlib import Path
+from typing import List, Tuple, Optional, Dict, Any
+import numpy as np
+import faiss
+from uuid import uuid4
+
+from tools.document import Document
+
+
+class InMemoryDocstore:
+    """Simple in-memory document store."""
+
+    def __init__(self):
+        self._dict: Dict[str, Document] = {}
+
+    def add(self, mapping: Dict[str, Document]):
+        """Add documents to the store."""
+        if not isinstance(self._dict, dict):
+            # Ensure _dict is a dictionary
+            if hasattr(self._dict, '_dict'):
+                self._dict = self._dict._dict
+            else:
+                self._dict = {}
+        self._dict.update(mapping)
+
+    def get(self, key: str) -> Optional[Document]:
+        """Get a document by key."""
+        if not isinstance(self._dict, dict):
+            # Ensure _dict is a dictionary
+            if hasattr(self._dict, '_dict'):
+                self._dict = self._dict._dict
+            else:
+                self._dict = {}
+        return self._dict.get(key)
+
+
+class FAISS:
+    """FAISS vectorstore wrapper."""
+
+    def __init__(
+        self,
+        embedding_function,
+        index: Optional[faiss.Index] = None,
+        docstore: Optional[InMemoryDocstore] = None,
+        index_to_docstore_id: Optional[Dict[int, str]] = None
+    ):
+        self.embedding_function = embedding_function
+        self.index = index
+        self.docstore = docstore if docstore else InMemoryDocstore()
+        self.index_to_docstore_id = index_to_docstore_id if index_to_docstore_id else {}
+
+    @classmethod
+    def from_documents(
+        cls,
+        documents: List[Document],
+        embedding
+    ) -> "FAISS":
+        """Create a FAISS index from documents."""
+        if not documents:
+            raise ValueError("No documents provided")
+
+        # Generate embeddings
+        texts = [doc.page_content for doc in documents]
+        embeddings = embedding.embed_documents(texts)
+        embeddings_np = np.array(embeddings).astype("float32")
+
+        # Create FAISS index
+        dimension = embeddings_np.shape[1]
+        index = faiss.IndexFlatIP(dimension)
+        index.add(embeddings_np)
+
+        # Create docstore
+        docstore = InMemoryDocstore()
+        index_to_docstore_id = {}
+
+        for i, doc in enumerate(documents):
+            doc_id = str(uuid4())
+            docstore.add({doc_id: doc})
+            index_to_docstore_id[i] = doc_id
+
+        return cls(
+            embedding_function=embedding.embed_query,
+            index=index,
+            docstore=docstore,
+            index_to_docstore_id=index_to_docstore_id
+        )
+
+    def similarity_search_with_score(
+        self,
+        query: str,
+        k: int = 4
+    ) -> List[Tuple[Document, float]]:
+        """Search for similar documents with scores."""
+        if self.index is None:
+            return []
+
+        # Get query embedding
+        query_embedding = self.embedding_function(query)
+        query_vector = np.array([query_embedding]).astype("float32")
+
+        # Search
+        scores, indices = self.index.search(query_vector, k)
+
+        results = []
+        for score, idx in zip(scores[0], indices[0]):
+            if idx < 0:  # FAISS returns -1 for invalid indices
+                continue
+            doc_id = self.index_to_docstore_id.get(idx)
+            if doc_id:
+                doc = self.docstore.get(doc_id)
+                if doc:
+                    results.append((doc, float(score)))
+
+        return results
+
+    def save_local(self, folder_path: str):
+        """Save the FAISS index and docstore to disk."""
+        folder = Path(folder_path)
+        folder.mkdir(parents=True, exist_ok=True)
+
+        # Save FAISS index
+        faiss.write_index(self.index, str(folder / "index.faiss"))
+
+        # Save docstore and mapping
+        save_dict = {
+            "docstore": self.docstore._dict,
+            "index_to_docstore_id": self.index_to_docstore_id
+        }
+        with open(folder / "index.pkl", "wb") as f:
+            pickle.dump(save_dict, f)
+
+    @classmethod
+    def load_local(
+        cls,
+        folder_path: str,
+        embeddings,
+        allow_dangerous_deserialization: bool = False
+    ) -> "FAISS":
+        """Load a FAISS index from disk."""
+        if not allow_dangerous_deserialization:
+            raise ValueError("allow_dangerous_deserialization must be True to load pickled files")
+
+        folder = Path(folder_path)
+
+        # Load FAISS index
+        index = faiss.read_index(str(folder / "index.faiss"))
+
+        # Load docstore and mapping
+        with open(folder / "index.pkl", "rb") as f:
+            save_dict = pickle.load(f)
+
+        # Handle different pickle formats (dict or tuple)
+        if isinstance(save_dict, dict):
+            # Expected format: dictionary with keys
+            docstore_data = save_dict.get("docstore", {})
+            index_to_docstore_id = save_dict.get("index_to_docstore_id", {})
+        elif isinstance(save_dict, tuple):
+            # Legacy format: might be a tuple, try to unpack
+            # If tuple has 2 elements, assume (docstore_dict, index_to_docstore_id)
+            if len(save_dict) == 2:
+                docstore_data, index_to_docstore_id = save_dict
+            else:
+                raise ValueError(
+                    f"Unexpected pickle format: tuple with {len(save_dict)} elements. "
+                    f"Expected dictionary or tuple with 2 elements."
+                )
+        else:
+            raise TypeError(
+                f"Unexpected pickle format: {type(save_dict)}. "
+                f"Expected dictionary or tuple."
+            )
+
+        # Handle docstore_data - could be a dict or InMemoryDocstore object
+        docstore = InMemoryDocstore()
+        if isinstance(docstore_data, dict):
+            # It's a dictionary, use it directly
+            docstore._dict = docstore_data
+        elif isinstance(docstore_data, InMemoryDocstore):
+            # It's already an InMemoryDocstore object, copy its _dict
+            docstore._dict = docstore_data._dict.copy()
+        else:
+            # Try to convert to dict or raise error
+            raise TypeError(
+                f"Unexpected docstore format: {type(docstore_data)}. "
+                f"Expected dictionary or InMemoryDocstore object."
+            )
+
+        return cls(
+            embedding_function=embeddings.embed_query,
+            index=index,
+            docstore=docstore,
+            index_to_docstore_id=index_to_docstore_id
+        )
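An end-to-end sketch of the new store (not part of the diff; document texts and the output folder name are illustrative). Note that IndexFlatIP scores by inner product, so scores match cosine similarity only when the embedding model returns normalised vectors:

    from tools.document import Document
    from tools.embeddings import HuggingFaceEmbeddings
    from tools.faiss_store import FAISS

    docs = [Document("FAISS indexes dense vectors.", {"id": 1}),
            Document("BM25 ranks by term frequency.", {"id": 2})]
    emb = HuggingFaceEmbeddings()

    store = FAISS.from_documents(docs, emb)        # builds an IndexFlatIP over the embeddings
    hits = store.similarity_search_with_score("vector index", k=1)
    store.save_local("faiss_out")                  # writes index.faiss and index.pkl
    store2 = FAISS.load_local("faiss_out", emb, allow_dangerous_deserialization=True)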
tools/ingest.py
CHANGED

@@ -14,17 +14,18 @@ import zipfile
 import tempfile
 from pathlib import Path
 
-from
-from
-from
-from langchain.docstore.document import Document
-#from chatfuncs.config import EMBEDDINGS_MODEL_NAME
-from langchain_core.embeddings import Embeddings # Import Embeddings for type hinting
+from tools.embeddings import HuggingFaceEmbeddings
+from tools.faiss_store import FAISS, InMemoryDocstore
+from tools.text_splitter import RecursiveCharacterTextSplitter
+from tools.document import Document
+from typing import Protocol # For type hinting
 from tqdm import tqdm
-from langchain_community.docstore.in_memory import InMemoryDocstore # To manually build the docstore
 from uuid import uuid4 # To generate unique IDs for documents in the docstore
+
+# Type hint for embeddings
+class Embeddings(Protocol):
+    def embed_documents(self, texts: List[str]) -> List[List[float]]: ...
+    def embed_query(self, text: str) -> List[float]: ...
 from bs4 import BeautifulSoup
 from docx import Document as Doc
 from pypdf import PdfReader

@@ -695,7 +696,7 @@ def embed_faiss_save_to_zip(
     raw_faiss_index = faiss.IndexFlatIP(embedding_dimension)
     raw_faiss_index.add(embeddings_np) # Add all vectors to the raw FAISS index
 
-    # 3. Create the
+    # 3. Create the FAISS Vectorstore from the components
     # The `embedding_function` is used for subsequent queries to the vectorstore,
     # not for building the initial index here (as we've already done that).
     vectorstore = FAISS(

@@ -703,7 +704,6 @@
         index=raw_faiss_index,
         docstore=docstore,
         index_to_docstore_id=index_to_docstore_id
-        # distance_strategy defaults to COSINE, which is appropriate for IndexFlatIP
     )
     # --- Progress Bar Integration Ends Here ---
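The `Embeddings` Protocol above replaces langchain's abstract base class with structural typing: any object exposing matching `embed_documents` and `embed_query` methods satisfies the hint, with no inheritance required. A self-contained sketch of how that works (not part of the diff; `FakeEmbeddings` and `ingest` are hypothetical names for illustration):

    from typing import List, Protocol

    class Embeddings(Protocol):
        def embed_documents(self, texts: List[str]) -> List[List[float]]: ...
        def embed_query(self, text: str) -> List[float]: ...

    class FakeEmbeddings:  # no inheritance needed - matching methods are enough
        def embed_documents(self, texts: List[str]) -> List[List[float]]:
            return [[0.0, 1.0] for _ in texts]
        def embed_query(self, text: str) -> List[float]:
            return [0.0, 1.0]

    def ingest(model: Embeddings) -> None:
        print(model.embed_query("hello"))

    ingest(FakeEmbeddings())  # type-checks because FakeEmbeddings structurally matches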
tools/text_splitter.py
ADDED

@@ -0,0 +1,112 @@
+"""
+Custom text splitter to replace langchain RecursiveCharacterTextSplitter.
+"""
+from typing import List, Optional, Callable
+import re
+
+
+class RecursiveCharacterTextSplitter:
+    """Splits text recursively by characters."""
+
+    def __init__(
+        self,
+        chunk_size: int = 1000,
+        chunk_overlap: int = 200,
+        separators: Optional[List[str]] = None,
+        length_function: Optional[Callable[[str], int]] = None,
+        add_start_index: bool = False
+    ):
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+        self.separators = separators if separators else ["\n\n", "\n", ". ", "! ", "? ", " ", ""]
+        self.length_function = length_function if length_function else len
+        self.add_start_index = add_start_index
+
+    def split_text(self, text: str) -> List[str]:
+        """Split text into chunks."""
+        if not text:
+            return []
+
+        # Start with the full text
+        splits = [text]
+
+        # Try each separator in order
+        for separator in self.separators:
+            if not separator:
+                # Last separator - split by character
+                new_splits = []
+                for split in splits:
+                    if self.length_function(split) <= self.chunk_size:
+                        new_splits.append(split)
+                    else:
+                        # Split by character
+                        for i in range(0, len(split), self.chunk_size - self.chunk_overlap):
+                            chunk = split[i:i + self.chunk_size]
+                            if chunk:
+                                new_splits.append(chunk)
+                splits = new_splits
+                break
+
+            new_splits = []
+            for split in splits:
+                if self.length_function(split) <= self.chunk_size:
+                    new_splits.append(split)
+                else:
+                    # Split by separator
+                    parts = split.split(separator)
+                    current_chunk = ""
+                    for part in parts:
+                        part_with_sep = part if not current_chunk else separator + part
+                        if self.length_function(current_chunk + part_with_sep) <= self.chunk_size:
+                            current_chunk += part_with_sep
+                        else:
+                            if current_chunk:
+                                new_splits.append(current_chunk)
+                            current_chunk = part_with_sep
+                    if current_chunk:
+                        new_splits.append(current_chunk)
+            splits = new_splits
+
+            # If all splits are small enough, we're done
+            if all(self.length_function(s) <= self.chunk_size for s in splits):
+                break
+
+        # Apply overlap
+        if self.chunk_overlap > 0 and len(splits) > 1:
+            overlapped_splits = []
+            for i, split in enumerate(splits):
+                if i == 0:
+                    overlapped_splits.append(split)
+                else:
+                    # Add overlap from previous chunk
+                    prev_chunk = splits[i - 1]
+                    overlap_text = prev_chunk[-self.chunk_overlap:] if len(prev_chunk) > self.chunk_overlap else prev_chunk
+                    overlapped_splits.append(overlap_text + split)
+            splits = overlapped_splits
+
+        return splits
+
+    def create_documents(
+        self,
+        texts: List[str],
+        metadatas: Optional[List[dict]] = None
+    ) -> List:
+        """Create Document objects from texts."""
+        from tools.document import Document
+
+        all_docs = []
+        metadatas = metadatas if metadatas else [{}] * len(texts)
+
+        for text, metadata in zip(texts, metadatas):
+            splits = self.split_text(text)
+            for i, split in enumerate(splits):
+                doc_metadata = metadata.copy()
+                if self.add_start_index:
+                    # Find start index in original text
+                    start_idx = text.find(split)
+                    if start_idx != -1:
+                        doc_metadata["start_index"] = start_idx
+                all_docs.append(Document(page_content=split, metadata=doc_metadata))
+
+        return all_docs