seanpedrickcase committed
Commit 5b2f824 · 1 Parent(s): 48d391e

Removed langchain and llama-cpp-python dependencies (no longer actively supported). Updated packages. Updated default dataset.
Dockerfile CHANGED
@@ -27,7 +27,6 @@ COPY requirements_aws.txt .
  RUN pip install torch==2.5.1+cpu --target=/install --index-url https://download.pytorch.org/whl/cpu \
  && pip install --no-cache-dir --target=/install sentence-transformers==4.1.0 --no-deps \
  && pip install --no-cache-dir --target=/install span-marker==1.7.0 --no-deps \
- && pip install --no-cache-dir --target=/install langchain-huggingface==0.1.2 --no-deps \
  && pip install --no-cache-dir --target=/install keybert==0.9.0 --no-deps \
  && pip install --no-cache-dir --target=/install -r requirements_aws.txt

app.py CHANGED
@@ -1,11 +1,10 @@
  import os
  from typing import Type
- #from langchain_huggingface.embeddings import HuggingFaceEmbeddings
- from langchain_community.vectorstores import FAISS
+ from tools.faiss_store import FAISS
  import gradio as gr
  import pandas as pd
  from torch import float16, float32
- from llama_cpp import Llama
+
  from huggingface_hub import hf_hub_download
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM

@@ -13,7 +12,7 @@ from tools.ingest import embed_faiss_save_to_zip, load_embeddings_model, get_fai
  from tools.helper_functions import get_connection_params, reveal_feedback_buttons, wipe_logs
  from tools.aws_functions import upload_file_to_s3
  from tools.auth import authenticate_user
- from tools.config import FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, HOST_NAME, COGNITO_AUTH, INPUT_FOLDER, OUTPUT_FOLDER, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, DEFAULT_EMBEDDINGS_LOCATION, EMBEDDINGS_MODEL_NAME, DEFAULT_DATA_SOURCE, HF_TOKEN, LARGE_MODEL_REPO_ID, LARGE_MODEL_GGUF_FILE, LARGE_MODEL_NAME, SMALL_MODEL_NAME, SMALL_MODEL_REPO_ID, DEFAULT_DATA_SOURCE_NAME, DEFAULT_EXAMPLES, DEFAULT_MODEL_CHOICES, RUN_GEMINI_MODELS, LOAD_LARGE_MODEL
+ from tools.config import FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, HOST_NAME, COGNITO_AUTH, INPUT_FOLDER, OUTPUT_FOLDER, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, DEFAULT_EMBEDDINGS_LOCATION, EMBEDDINGS_MODEL_NAME, DEFAULT_DATA_SOURCE, HF_TOKEN, LARGE_MODEL_REPO_ID, LARGE_MODEL_GGUF_FILE, LARGE_MODEL_NAME, SMALL_MODEL_NAME, SMALL_MODEL_REPO_ID, DEFAULT_DATA_SOURCE_NAME, DEFAULT_EXAMPLES, DEFAULT_MODEL_CHOICES, RUN_GEMINI_MODELS, LOAD_LARGE_MODEL, GEMINI_API_KEY
  from tools.model_load import torch_device, gpu_config, cpu_config, context_length
  import tools.chatfuncs as chatf
  import tools.ingest as ing

@@ -39,10 +38,11 @@ if isinstance(DEFAULT_MODEL_CHOICES, str): default_model_choices = eval(DEFAULT_
  ###
  # Load in default embeddings and embeddings model name
  embeddings_model = load_embeddings_model(EMBEDDINGS_MODEL_NAME)
- vectorstore = get_faiss_store(zip_file_path=DEFAULT_EMBEDDINGS_LOCATION,embeddings_model=embeddings_model)#globals()["embeddings"])
+ # vectorstore = get_faiss_store(zip_file_path=DEFAULT_EMBEDDINGS_LOCATION,embeddings_model=embeddings_model)#globals()["embeddings"])
+ vectorstore = None

  chatf.embeddings = embeddings_model
- chatf.vectorstore = vectorstore
+ # chatf.vectorstore = vectorstore

  def docs_to_faiss_save(docs_out:PandasDataFrame, embeddings_model=embeddings_model):

@@ -64,17 +64,17 @@ def create_hf_model(model_name:str, hf_token=HF_TOKEN):
          model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")#, torch_dtype=torch.float16)
      else:
          if hf_token:
-             model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", token=hf_token, torch_dtype=float32) # , torch_dtype=float16 - not compatible with CPU and Gemma 3
+             model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", token=hf_token) # , torch_dtype=float16 - not compatible with CPU and Gemma 3
          else:
-             model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=float32) # , torch_dtype=float16
+             model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto") # , torch_dtype=float16
  else:
      if "flan" in model_name:
          model = AutoModelForSeq2SeqLM.from_pretrained(model_name)#, torch_dtype=torch.float16)
      else:
          if hf_token:
-             model = AutoModelForCausalLM.from_pretrained(model_name, token=hf_token, torch_dtype=float32) # , torch_dtype=float16
+             model = AutoModelForCausalLM.from_pretrained(model_name, token=hf_token) # , torch_dtype=float16
          else:
-             model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=float32) # , torch_dtype=float16
+             model = AutoModelForCausalLM.from_pretrained(model_name) # , torch_dtype=float16

  if hf_token:
      tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length = context_length, token=hf_token)

@@ -97,6 +97,7 @@ def load_model(model_type:str, gpu_layers:int, gpu_config:dict=gpu_config, cpu_c
      print("Loading with", cpu_config.n_gpu_layers, "model layers sent to GPU.")

      try:
+         from llama_cpp import Llama
          model = Llama(
              model_path=hf_hub_download(
                  repo_id=LARGE_MODEL_REPO_ID,

@@ -248,9 +249,9 @@ with app:
          with gr.Column(scale=3):
              model_choice = gr.Radio(label="Choose a chat model", value=SMALL_MODEL_NAME, choices = default_model_choices)
              if RUN_GEMINI_MODELS == "1":
-                 in_api_key = gr.Textbox(value = "", label="Enter Gemini API key (only if using Google API models)", lines=1, type="password",interactive=True, visible=True)
+                 in_api_key = gr.Textbox(value = GEMINI_API_KEY, label="Enter Gemini API key (only if using Google API models)", lines=1, type="password",interactive=True, visible=True)
              else:
-                 in_api_key = gr.Textbox(value = "", label="Enter Gemini API key (only if using Google API models)", lines=1, type="password",interactive=True, visible=False)
+                 in_api_key = gr.Textbox(value = GEMINI_API_KEY, label="Enter Gemini API key (only if using Google API models)", lines=1, type="password",interactive=True, visible=False)
          with gr.Column(scale=1):
              change_model_button = gr.Button(value="Load model")

@@ -264,7 +265,7 @@ with app:
      load_text = gr.Text(label="Load status")

      gr.HTML(
-         "<center>This app is powered by Gradio, Transformers, and Llama.cpp.</center>"
+         "<center>This app is powered by Gradio and Transformers.</center>"
      )

      examples_set.change(fn=chatf.update_message, inputs=[examples_set], outputs=[message])
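
Note: `from llama_cpp import Llama` now happens lazily inside `load_model`'s try block, so llama-cpp-python becomes an optional dependency that is only required when the large GGUF model is actually loaded. A minimal sketch of that pattern (the function name here is illustrative, not from the repo):

```python
# Sketch of the deferred-import pattern used in load_model above (assumes
# llama-cpp-python may be absent; load_large_model is a hypothetical name).
def load_large_model(model_path: str):
    try:
        from llama_cpp import Llama  # imported only when actually needed
    except ImportError as e:
        raise RuntimeError("llama-cpp-python is not installed; "
                           "install it to load the large GGUF model") from e
    return Llama(model_path=model_path)
```
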
faiss_embedding/faiss_embedding.zip CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:20d1c95c817837b8538657ded2fbc840677ccb28fa92becf8d678d51f5662199
- size 286111
+ oid sha256:5e9d58ea966d7fb5bf05c6d13217ab0a4f15c18607976b3ed443b6bd163b390e
+ size 293425

requirements.txt CHANGED
@@ -1,26 +1,24 @@
- langchain==0.3.24
- langchain-huggingface==0.1.2
- langchain-community==0.3.22
  beautifulsoup4==4.13.4
- google-generativeai==0.8.5
+ google-genai==1.50.0
  pandas==2.2.3
- transformers==4.51.3
+ markdown==3.8.1
+ transformers==4.57.1
  # For Windows https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.2/llama_cpp_python-0.3.2-cp311-#cp311-win_amd64.whl -C cmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS"
- llama-cpp-python==0.3.2 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu # For linux if dependencies for below build command are not available in the environment
+ # llama-cpp-python==0.3.2 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu # For linux if dependencies for below build command are not available in the environment
  #llama-cpp-python==0.3.9 -C cmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS" # CPU
  #llama-cpp-python==0.3.9 -C cmake.args="-DGGML_CUDA=on" # With CUDA
- torch==2.5.1 --extra-index-url https://download.pytorch.org/whl/cpu
+ torch>=2.6.0 --extra-index-url https://download.pytorch.org/whl/cpu
- sentence_transformers==4.1.0
+ sentence_transformers==5.1.2
  faiss-cpu==1.10.0
  pypdf==5.4.0
  python-docx==1.1.2
  keybert==0.9.0
  span-marker==1.7.0
- gradio==5.25.2
+ gradio==5.49.1
  nltk==3.9.1
- bm25s==0.2.12
+ bm25s==0.2.14
  PyStemmer==2.2.0.3
- scipy==1.15.2
+ scipy==1.16.3
- numpy==1.26.4
+ numpy==2.2.6
- boto3==1.38.0
+ boto3==1.40.72
  python-dotenv==1.1.0

requirements_aws.txt CHANGED
@@ -1,15 +1,13 @@
- #langchain==0.3.24
- #langchain-huggingface==0.1.2 # Loaded in Dockerfile
  boto3==1.38.0
  python-dotenv==1.1.0
- langchain-community==0.3.22
  beautifulsoup4==4.13.4
- google-generativeai==0.8.5
+ google-genai==1.50.0
  pandas==2.2.3
- transformers==4.51.3
+ markdown==3.8.1
+ transformers==4.57.1
  # For Windows https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.2/llama_cpp_python-0.3.2-cp311-#cp311-win_amd64.whl -C cmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS"
  #llama-cpp-python==0.3.2 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu # For linux if dependencies for below build command are not available in the environment
- llama-cpp-python==0.3.8 -C cmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS"
+ # llama-cpp-python==0.3.16 -C cmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS"
  #torch==2.5.1 --extra-index-url https://download.pytorch.org/whl/cpu # Loaded in Dockerfile
  #sentence_transformers==4.1.0 # Loaded in Dockerfile
  faiss-cpu==1.10.0
@@ -17,11 +15,11 @@ pypdf==5.4.0
  python-docx==1.1.2
  #keybert==0.9.0 # Loaded in Dockerfile
  #span-marker==1.7.0 # Loaded in Dockerfile
- gradio==5.25.2
+ gradio==5.49.1
  nltk==3.9.1
- bm25s==0.2.12
+ bm25s==0.2.14
  PyStemmer==2.2.0.3
- scikit-learn==1.6.1
+ scikit-learn==1.7.2
- scipy==1.15.2
+ scipy==1.16.3
- numpy==1.26.4
+ numpy==2.2.6

requirements_gpu.txt CHANGED
@@ -1,24 +1,22 @@
- #langchain==0.3.24
- langchain-community==0.3.22
- langchain-huggingface==0.1.2
  beautifulsoup4==4.13.4
- google-generativeai==0.8.5
+ google-genai==1.50.0
  pandas==2.2.3
- transformers==4.51.3
+ markdown==3.8.1
+ transformers==4.57.1
- torch==2.5.1 --extra-index-url https://download.pytorch.org/whl/cu121
+ torch>=2.6.0 --extra-index-url https://download.pytorch.org/whl/cu126
- #llama-cpp-python==0.3.2 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
+ #llama-cpp-python==0.3.2 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124
- llama-cpp-python==0.3.8 -C cmake.args="-DGGML_CUDA=on"
+ # llama-cpp-python==0.3.16 -C cmake.args="-DGGML_CUDA=on"
- sentence_transformers==4.1.0
+ sentence_transformers==5.1.2
  faiss-cpu==1.10.0
  pypdf==5.4.0
  python-docx==1.1.2
  keybert==0.9.0
  span-marker==1.7.0
- gradio==5.25.2
+ gradio==5.49.1
  nltk==3.9.1
- bm25s==0.2.12
+ bm25s==0.2.14
  PyStemmer==2.2.0.3
- scipy==1.15.2
+ scipy==1.16.3
- numpy==1.26.4
+ numpy==2.2.6
- boto3==1.38.0
+ boto3==1.40.72
  python-dotenv==1.1.0

tools/chatfuncs.py CHANGED
@@ -1,11 +1,12 @@
  import re
  import os
  import datetime
- from typing import Type, Dict, List, Tuple
+ from typing import Type, Dict, List, Tuple, Union
  import time
  from itertools import compress
  import pandas as pd
- import google.generativeai as ai
+ from google import genai as ai
+ from google.genai import types
  import gradio as gr
  from gradio import Progress
  import boto3

@@ -14,7 +15,10 @@ from nltk.corpus import stopwords
  from nltk.tokenize import RegexpTokenizer
  from nltk.stem import WordNetLemmatizer
  from keybert import KeyBERT
- from langchain_huggingface.embeddings import HuggingFaceEmbeddings
+ from tools.embeddings import HuggingFaceEmbeddings
+ from tools.faiss_store import FAISS
+ from tools.text_splitter import RecursiveCharacterTextSplitter
+ from tools.document import Document

  # For Name Entity Recognition model
  #from span_marker import SpanMarkerModel # Not currently used

@@ -26,12 +30,6 @@ import Stemmer
  import torch.cuda
  from threading import Thread
  from transformers import pipeline, TextIteratorStreamer
- # Langchain functions
- from langchain.prompts import PromptTemplate
- from langchain_community.vectorstores import FAISS
- from langchain_community.retrievers import SVMRetriever
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain.docstore.document import Document

  from tools.prompts import instruction_prompt_template_alpaca, instruction_prompt_mistral_orca, instruction_prompt_phi3, instruction_prompt_llama3, instruction_prompt_qwen, instruction_prompt_template_orca, instruction_prompt_gemma, instruction_prompt_template_gemini_aws
  from tools.model_load import temperature, max_new_tokens, sample, repetition_penalty, top_p, top_k, torch_device, CtransGenGenerationConfig, max_tokens

@@ -79,26 +77,19 @@ kw_model = pipeline("feature-extraction", model="sentence-transformers/all-MiniL
  def base_prompt_templates(model_type:str = SMALL_MODEL_NAME):

- #EXAMPLE_PROMPT = PromptTemplate(
- # template="\nCONTENT:\n\n{page_content}\n\nSOURCE: {source}\n\n",
- # input_variables=["page_content", "source"],
- #)
-
- CONTENT_PROMPT = PromptTemplate(
-     template="{page_content}\n\n",#\n\nSOURCE: {source}\n\n",
-     input_variables=["page_content"]
- )
+ # Simple string template for content
+ CONTENT_PROMPT_TEMPLATE = "{page_content}\n\n"

  # The main prompt:

  if model_type == SMALL_MODEL_NAME:
-     INSTRUCTION_PROMPT=PromptTemplate(template=instruction_prompt_gemma, input_variables=['question', 'summaries'])
+     INSTRUCTION_PROMPT_TEMPLATE = instruction_prompt_gemma
  elif model_type == LARGE_MODEL_NAME:
-     INSTRUCTION_PROMPT=PromptTemplate(template=instruction_prompt_phi3, input_variables=['question', 'summaries'])
+     INSTRUCTION_PROMPT_TEMPLATE = instruction_prompt_phi3
  else:
-     INSTRUCTION_PROMPT=PromptTemplate(template=instruction_prompt_template_gemini_aws, input_variables=['question', 'summaries'])
+     INSTRUCTION_PROMPT_TEMPLATE = instruction_prompt_template_gemini_aws

- return INSTRUCTION_PROMPT, CONTENT_PROMPT
+ return INSTRUCTION_PROMPT_TEMPLATE, CONTENT_PROMPT_TEMPLATE

  def write_out_metadata_as_string(metadata_in:str):
      metadata_string = [f"{' '.join(f'{k}: {v}' for k, v in d.items() if k != 'page_section')}" for d in metadata_in] # ['metadata']

@@ -175,7 +166,7 @@ def generate_expanded_prompt(
  sources_docs_content_string = '<br><br>'.join(doc_df['content_meta'])#.replace(" "," ")#.strip()

- instruction_prompt_out = instruction_prompt.format(question=new_question_kworded, summaries=docs_content_string)
+ instruction_prompt_out = instruction_prompt.replace('{question}', new_question_kworded).replace('{summaries}', docs_content_string)

  return instruction_prompt_out, sources_docs_content_string, new_question_kworded

@@ -269,9 +260,9 @@ def call_aws_claude(prompt: str, system_prompt: str, temperature: float, max_tok
  return response

- def construct_gemini_generative_model(in_api_key: str, temperature: float, model_choice: str, system_prompt: str, max_tokens: int) -> Tuple[object, dict]:
+ def construct_gemini_generative_model(in_api_key: str, temperature: float, model_choice: str, system_prompt: str, max_tokens: int, random_seed: int = None) -> Tuple[object, dict]:
  """
- Constructs a GenerativeModel for Gemini API calls.
+ Constructs a Client for Gemini API calls using the new google.genai package.

  Parameters:
  - in_api_key (str): The API key for authentication.

@@ -279,34 +270,37 @@ def construct_gemini_generative_model(in_api_key: str, temperature: float, model
  - model_choice (str): The choice of model to use for generation.
  - system_prompt (str): The system prompt to guide the generation.
  - max_tokens (int): The maximum number of tokens to generate.
+ - random_seed (int, optional): Random seed for reproducibility.

  Returns:
- - Tuple[object, dict]: A tuple containing the constructed GenerativeModel and its configuration.
+ - Tuple[object, dict]: A tuple containing the constructed Client and its configuration.
  """
- # Construct a GenerativeModel
+ # Construct a Client for the new API
  try:
      if in_api_key:
          #print("Getting API key from textbox")
          api_key = in_api_key
-         ai.configure(api_key=api_key)
+         client = ai.Client(api_key=api_key)
      elif "GOOGLE_API_KEY" in os.environ:
          #print("Searching for API key in environmental variables")
          api_key = os.environ["GOOGLE_API_KEY"]
-         ai.configure(api_key=api_key)
+         client = ai.Client(api_key=api_key)
      else:
-         print("No API key foound")
+         print("No API key found")
          raise gr.Error("No API key found.")
  except Exception as e:
      print(e)
+     raise

- config = ai.GenerationConfig(temperature=temperature, max_output_tokens=max_tokens)
+ # Create config with optional random_seed
+ config_kwargs = {"temperature": temperature, "max_output_tokens": max_tokens}
+ if random_seed is not None:
+     config_kwargs["seed"] = random_seed
+ config = types.GenerateContentConfig(**config_kwargs)

  print("model_choice:", model_choice)
-
- #model = ai.GenerativeModel.from_cached_content(cached_content=cache, generation_config=config)
- model = ai.GenerativeModel(model_name=model_choice, system_instruction=system_prompt, generation_config=config)

- return model, config
+ return client, config

  # Function to send a request and update history
  def send_request(prompt: str, conversation_history: List[dict], model: object, config: dict, model_choice: str, system_prompt: str, temperature: float, progress=Progress(track_tqdm=True)) -> Tuple[str, List[dict]]:

@@ -333,7 +327,15 @@ def send_request(prompt: str, conversation_history: List[dict], model: object, c
  # Generate the model's response
  if "gemini" in model_choice:
      try:
-         response = model.generate_content(contents=full_prompt, generation_config=config)
+         # New API: client.models.generate_content instead of model.generate_content
+         gemini_response = model.models.generate_content(model=model_choice, contents=full_prompt, config=config)
+         # Wrap response in ResponseObject for backwards compatibility
+         usage_metadata = {}
+         if hasattr(gemini_response, 'usage_metadata'):
+             usage_metadata = gemini_response.usage_metadata
+         elif hasattr(gemini_response, 'usage'):
+             usage_metadata = gemini_response.usage
+         response = ResponseObject(text=gemini_response.text, usage_metadata=usage_metadata)
      except Exception as e:
          # If fails, try again after 10 seconds in case there is a throttle limit
          print(e)

@@ -343,7 +345,14 @@ def send_request(prompt: str, conversation_history: List[dict], model: object, c
          print(out_message)
          progress(0.5, desc=out_message)
          time.sleep(30)
-         response = model.generate_content(contents=full_prompt, generation_config=config)
+         gemini_response = model.models.generate_content(model=model_choice, contents=full_prompt, config=config)
+         # Wrap response in ResponseObject for backwards compatibility
+         usage_metadata = {}
+         if hasattr(gemini_response, 'usage_metadata'):
+             usage_metadata = gemini_response.usage_metadata
+         elif hasattr(gemini_response, 'usage'):
+             usage_metadata = gemini_response.usage
+         response = ResponseObject(text=gemini_response.text, usage_metadata=usage_metadata)
      except Exception as e:
          print(e)
          return "", conversation_history

@@ -559,7 +568,7 @@ def produce_streaming_answer_chatbot(
  history.append({"metadata":None, "options":None, "role": "assistant", "content": ''})

  for char in clean_response_text:
-     time.sleep(0.005)
+     time.sleep(0.001)
      history[-1]['content'] += char
      yield history

@@ -594,7 +603,7 @@ def produce_streaming_answer_chatbot(
  history.append({"metadata":None, "options":None, "role": "assistant", "content": ''})

  for char in clean_response_text:
-     time.sleep(0.005)
+     time.sleep(0.001)
      history[-1]['content'] += char
      yield history

@@ -795,31 +804,29 @@ def hybrid_retrieval(
  # 3rd level check on retrieved docs with SVM retriever
- # Check the type of the embeddings_model object
- embeddings_type = type(embeddings_model)
-
- #hf_embeddings = HuggingFaceEmbeddings(**embeddings)
- hf_embeddings = embeddings_model
-
- svm_retriever = SVMRetriever.from_texts(content_keep, hf_embeddings, k = k_val)
- svm_result = svm_retriever.invoke(new_question_kworded)
-
- svm_rank=[]
+ # Note: SVM retriever removed - using vector similarity only
+ # If svm_weight > 0, we'll use a simple ranking based on vector similarity
+ svm_rank = []
  svm_score = []
-
- for vec_item in docs_keep:
-     x = 0
-     for svm_item in svm_result:
-         x = x + 1
-         if svm_item.page_content == vec_item[0].page_content:
-             svm_rank.append(x)
-             svm_score.append((docs_keep_length/x)*svm_weight)
+
+ if svm_weight > 0:
+     # Use vector similarity ranking as a proxy for SVM ranking
+     # This maintains the same interface but uses vector scores
+     for i, vec_item in enumerate(docs_keep):
+         # Use inverse rank (lower rank = higher score)
+         rank = i + 1
+         svm_rank.append(rank)
+         svm_score.append((docs_keep_length/rank)*svm_weight)
+ else:
+     # If svm_weight is 0, set all scores to 0
+     svm_rank = [0] * docs_keep_length
+     svm_score = [0.0] * docs_keep_length

- ## Calculate final score based on three ranking methods
- final_score = [a + b + c for a, b, c in zip(vec_score, bm25_score, svm_score)]
+ ## Calculate final score based on ranking methods (vector, BM25, and optionally SVM)
+ # Ensure all lists have the same length
+ min_len = min(len(vec_score), len(bm25_score), len(svm_score))
+ final_score = [a + b + c for a, b, c in zip(vec_score[:min_len], bm25_score[:min_len], svm_score[:min_len])]
  final_rank = [sorted(final_score, reverse=True).index(x)+1 for x in final_score]
  # Force final_rank to increment by 1 each time
  final_rank = list(pd.Series(final_rank).rank(method='first'))
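
For context, a hedged sketch of the migrated Gemini call path that this diff adopts: google.generativeai's module-level `configure()` plus `GenerativeModel` is replaced by a `google.genai` Client (the model name below is illustrative; the client and config calls mirror the `+` lines above):

```python
# Minimal sketch of the google.genai migration, assuming a valid API key.
from google import genai as ai
from google.genai import types

client = ai.Client(api_key="YOUR_API_KEY")  # placeholder key
config = types.GenerateContentConfig(temperature=0.1, max_output_tokens=256, seed=42)

# client.models.generate_content replaces model.generate_content
response = client.models.generate_content(
    model="gemini-2.0-flash",  # illustrative model name
    contents="Summarise the retrieved passages.",
    config=config,
)
print(response.text)  # plain text of the reply, as wrapped by ResponseObject above
```
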
tools/config.py CHANGED
@@ -189,9 +189,9 @@ LOAD_LARGE_MODEL = get_or_create_env_var("LOAD_LARGE_MODEL", '0')

  LARGE_MODEL_NAME = get_or_create_env_var("LARGE_MODEL_NAME", "Phi 3.5 Mini (larger, slow)")

- LARGE_MODEL_REPO_ID = get_or_create_env_var("LARGE_MODEL_REPO_ID", "QuantFactory/Phi-3.5-mini-instruct-GGUF") # "QuantFactory/Phi-3-mini-128k-instruct-GGUF"), # "QuantFactory/Meta-Llama-3-8B-Instruct-GGUF-v2"), #"microsoft/Phi-3-mini-4k-instruct-gguf"),#"TheBloke/Mistral-7B-OpenOrca-GGUF"),
+ LARGE_MODEL_REPO_ID = get_or_create_env_var("LARGE_MODEL_REPO_ID", "QuantFactory/Phi-3.5-mini-instruct-GGUF") # THIS METHOD IS DEPRECATED AND WILL NO LONGER BE USED IN FUTURE (Llama-cpp-python is no longer being updated)

- LARGE_MODEL_GGUF_FILE = get_or_create_env_var("LARGE_MODEL_GGUF_FILE", "Phi-3.5-mini-instruct.Q4_K_M.gguf") #"Phi-3-mini-128k-instruct.Q4_K_M.gguf") #"Meta-Llama-3-8B-Instruct-v2.Q6_K.gguf") #"Phi-3-mini-4k-instruct-q4.gguf")#"mistral-7b-openorca.Q4_K_M.gguf"),
+ LARGE_MODEL_GGUF_FILE = get_or_create_env_var("LARGE_MODEL_GGUF_FILE", "Phi-3.5-mini-instruct.Q4_K_M.gguf")

  # Build up options for models
  default_model_choices = [SMALL_MODEL_NAME]

tools/document.py ADDED
@@ -0,0 +1,16 @@
+ """
+ Custom Document class to replace langchain Document.
+ """
+ from typing import Dict, Any, Optional
+
+
+ class Document:
+     """A simple document class with page_content and metadata."""
+
+     def __init__(self, page_content: str, metadata: Optional[Dict[str, Any]] = None):
+         self.page_content = page_content
+         self.metadata = metadata if metadata is not None else {}
+
+     def __repr__(self):
+         return f"Document(page_content='{self.page_content[:50]}...', metadata={self.metadata})"
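
A quick usage sketch for the new class (not part of the commit; values are illustrative):

```python
from tools.document import Document

# page_content holds the raw text; metadata is an arbitrary dict
doc = Document(page_content="FAISS is a library for vector similarity search.",
               metadata={"source": "example.txt", "page": 1})
print(doc.metadata)  # {'source': 'example.txt', 'page': 1}
print(doc)           # __repr__ truncates page_content to its first 50 characters
```
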
tools/embeddings.py ADDED
@@ -0,0 +1,24 @@
+ """
+ Custom embeddings wrapper using sentence-transformers to replace langchain HuggingFaceEmbeddings.
+ """
+ from typing import List, Union
+ from sentence_transformers import SentenceTransformer
+
+
+ class HuggingFaceEmbeddings:
+     """Wrapper around SentenceTransformer to match langchain interface."""
+
+     def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2", **kwargs):
+         self.model_name = model_name
+         self.model = SentenceTransformer(model_name, **kwargs)
+
+     def embed_documents(self, texts: List[str]) -> List[List[float]]:
+         """Embed a list of documents."""
+         embeddings = self.model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
+         return embeddings.tolist()
+
+     def embed_query(self, text: str) -> List[float]:
+         """Embed a single query."""
+         embedding = self.model.encode([text], convert_to_numpy=True, show_progress_bar=False)
+         return embedding[0].tolist()
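
Usage sketch for the wrapper (illustrative; the model weights download on first use):

```python
from tools.embeddings import HuggingFaceEmbeddings

emb = HuggingFaceEmbeddings("sentence-transformers/all-MiniLM-L6-v2")
doc_vectors = emb.embed_documents(["first passage", "second passage"])
query_vector = emb.embed_query("a question about the passages")
# 2 document vectors; all-MiniLM-L6-v2 produces 384-dimensional embeddings
print(len(doc_vectors), len(doc_vectors[0]), len(query_vector))
```
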
tools/faiss_store.py ADDED
@@ -0,0 +1,201 @@
+ """
+ Custom FAISS vectorstore to replace langchain FAISS.
+ """
+ import os
+ import pickle
+ import tempfile
+ import zipfile
+ from pathlib import Path
+ from typing import List, Tuple, Optional, Dict, Any
+ import numpy as np
+ import faiss
+ from uuid import uuid4
+
+ from tools.document import Document
+
+
+ class InMemoryDocstore:
+     """Simple in-memory document store."""
+
+     def __init__(self):
+         self._dict: Dict[str, Document] = {}
+
+     def add(self, mapping: Dict[str, Document]):
+         """Add documents to the store."""
+         if not isinstance(self._dict, dict):
+             # Ensure _dict is a dictionary
+             if hasattr(self._dict, '_dict'):
+                 self._dict = self._dict._dict
+             else:
+                 self._dict = {}
+         self._dict.update(mapping)
+
+     def get(self, key: str) -> Optional[Document]:
+         """Get a document by key."""
+         if not isinstance(self._dict, dict):
+             # Ensure _dict is a dictionary
+             if hasattr(self._dict, '_dict'):
+                 self._dict = self._dict._dict
+             else:
+                 self._dict = {}
+         return self._dict.get(key)
+
+
+ class FAISS:
+     """FAISS vectorstore wrapper."""
+
+     def __init__(
+         self,
+         embedding_function,
+         index: Optional[faiss.Index] = None,
+         docstore: Optional[InMemoryDocstore] = None,
+         index_to_docstore_id: Optional[Dict[int, str]] = None
+     ):
+         self.embedding_function = embedding_function
+         self.index = index
+         self.docstore = docstore if docstore else InMemoryDocstore()
+         self.index_to_docstore_id = index_to_docstore_id if index_to_docstore_id else {}
+
+     @classmethod
+     def from_documents(
+         cls,
+         documents: List[Document],
+         embedding
+     ) -> "FAISS":
+         """Create a FAISS index from documents."""
+         if not documents:
+             raise ValueError("No documents provided")
+
+         # Generate embeddings
+         texts = [doc.page_content for doc in documents]
+         embeddings = embedding.embed_documents(texts)
+         embeddings_np = np.array(embeddings).astype("float32")
+
+         # Create FAISS index
+         dimension = embeddings_np.shape[1]
+         index = faiss.IndexFlatIP(dimension)
+         index.add(embeddings_np)
+
+         # Create docstore
+         docstore = InMemoryDocstore()
+         index_to_docstore_id = {}
+
+         for i, doc in enumerate(documents):
+             doc_id = str(uuid4())
+             docstore.add({doc_id: doc})
+             index_to_docstore_id[i] = doc_id
+
+         return cls(
+             embedding_function=embedding.embed_query,
+             index=index,
+             docstore=docstore,
+             index_to_docstore_id=index_to_docstore_id
+         )
+
+     def similarity_search_with_score(
+         self,
+         query: str,
+         k: int = 4
+     ) -> List[Tuple[Document, float]]:
+         """Search for similar documents with scores."""
+         if self.index is None:
+             return []
+
+         # Get query embedding
+         query_embedding = self.embedding_function(query)
+         query_vector = np.array([query_embedding]).astype("float32")
+
+         # Search
+         scores, indices = self.index.search(query_vector, k)
+
+         results = []
+         for score, idx in zip(scores[0], indices[0]):
+             if idx < 0:  # FAISS returns -1 for invalid indices
+                 continue
+             doc_id = self.index_to_docstore_id.get(idx)
+             if doc_id:
+                 doc = self.docstore.get(doc_id)
+                 if doc:
+                     results.append((doc, float(score)))
+
+         return results
+
+     def save_local(self, folder_path: str):
+         """Save the FAISS index and docstore to disk."""
+         folder = Path(folder_path)
+         folder.mkdir(parents=True, exist_ok=True)
+
+         # Save FAISS index
+         faiss.write_index(self.index, str(folder / "index.faiss"))
+
+         # Save docstore and mapping
+         save_dict = {
+             "docstore": self.docstore._dict,
+             "index_to_docstore_id": self.index_to_docstore_id
+         }
+         with open(folder / "index.pkl", "wb") as f:
+             pickle.dump(save_dict, f)
+
+     @classmethod
+     def load_local(
+         cls,
+         folder_path: str,
+         embeddings,
+         allow_dangerous_deserialization: bool = False
+     ) -> "FAISS":
+         """Load a FAISS index from disk."""
+         if not allow_dangerous_deserialization:
+             raise ValueError("allow_dangerous_deserialization must be True to load pickled files")
+
+         folder = Path(folder_path)
+
+         # Load FAISS index
+         index = faiss.read_index(str(folder / "index.faiss"))
+
+         # Load docstore and mapping
+         with open(folder / "index.pkl", "rb") as f:
+             save_dict = pickle.load(f)
+
+         # Handle different pickle formats (dict or tuple)
+         if isinstance(save_dict, dict):
+             # Expected format: dictionary with keys
+             docstore_data = save_dict.get("docstore", {})
+             index_to_docstore_id = save_dict.get("index_to_docstore_id", {})
+         elif isinstance(save_dict, tuple):
+             # Legacy format: might be a tuple, try to unpack
+             # If tuple has 2 elements, assume (docstore_dict, index_to_docstore_id)
+             if len(save_dict) == 2:
+                 docstore_data, index_to_docstore_id = save_dict
+             else:
+                 raise ValueError(
+                     f"Unexpected pickle format: tuple with {len(save_dict)} elements. "
+                     f"Expected dictionary or tuple with 2 elements."
+                 )
+         else:
+             raise TypeError(
+                 f"Unexpected pickle format: {type(save_dict)}. "
+                 f"Expected dictionary or tuple."
+             )
+
+         # Handle docstore_data - could be a dict or InMemoryDocstore object
+         docstore = InMemoryDocstore()
+         if isinstance(docstore_data, dict):
+             # It's a dictionary, use it directly
+             docstore._dict = docstore_data
+         elif isinstance(docstore_data, InMemoryDocstore):
+             # It's already an InMemoryDocstore object, copy its _dict
+             docstore._dict = docstore_data._dict.copy()
+         else:
+             # Unsupported persisted format
+             raise TypeError(
+                 f"Unexpected docstore format: {type(docstore_data)}. "
+                 f"Expected dictionary or InMemoryDocstore object."
+             )
+
+         return cls(
+             embedding_function=embeddings.embed_query,
+             index=index,
+             docstore=docstore,
+             index_to_docstore_id=index_to_docstore_id
+         )
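
End-to-end sketch of the replacement vectorstore (illustrative values; assumes the tools/document.py and tools/embeddings.py modules added above):

```python
from tools.document import Document
from tools.embeddings import HuggingFaceEmbeddings
from tools.faiss_store import FAISS

emb = HuggingFaceEmbeddings()
docs = [Document("FAISS indexes dense vectors.", {"source": "a"}),
        Document("BM25 ranks by term frequency.", {"source": "b"})]
store = FAISS.from_documents(docs, emb)

# IndexFlatIP returns inner-product scores: higher means more similar
for doc, score in store.similarity_search_with_score("dense vector search", k=1):
    print(score, doc.metadata["source"])

store.save_local("faiss_out")
# Loading unpickles the docstore, so the flag must be passed explicitly
reloaded = FAISS.load_local("faiss_out", emb, allow_dangerous_deserialization=True)
```
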
tools/ingest.py CHANGED
@@ -14,17 +14,18 @@ import zipfile
  import tempfile
  from pathlib import Path

- from langchain_huggingface.embeddings import HuggingFaceEmbeddings
- #from langchain_community.embeddings import HuggingFaceEmbeddings # HuggingFaceInstructEmbeddings,
- from langchain_community.vectorstores.faiss import FAISS
- #from langchain_community.vectorstores import Chroma
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain.docstore.document import Document
- #from chatfuncs.config import EMBEDDINGS_MODEL_NAME
- from langchain_core.embeddings import Embeddings # Import Embeddings for type hinting
+ from tools.embeddings import HuggingFaceEmbeddings
+ from tools.faiss_store import FAISS, InMemoryDocstore
+ from tools.text_splitter import RecursiveCharacterTextSplitter
+ from tools.document import Document
+ from typing import Protocol # For type hinting
  from tqdm import tqdm
- from langchain_community.docstore.in_memory import InMemoryDocstore # To manually build the docstore
  from uuid import uuid4 # To generate unique IDs for documents in the docstore
+
+ # Type hint for embeddings
+ class Embeddings(Protocol):
+     def embed_documents(self, texts: List[str]) -> List[List[float]]: ...
+     def embed_query(self, text: str) -> List[float]: ...
  from bs4 import BeautifulSoup
  from docx import Document as Doc
  from pypdf import PdfReader

@@ -695,7 +696,7 @@ def embed_faiss_save_to_zip(
  raw_faiss_index = faiss.IndexFlatIP(embedding_dimension)
  raw_faiss_index.add(embeddings_np) # Add all vectors to the raw FAISS index

- # 3. Create the LangChain FAISS Vectorstore from the components
+ # 3. Create the FAISS Vectorstore from the components
  # The `embedding_function` is used for subsequent queries to the vectorstore,
  # not for building the initial index here (as we've already done that).
  vectorstore = FAISS(

@@ -703,7 +704,6 @@ def embed_faiss_save_to_zip(
      index=raw_faiss_index,
      docstore=docstore,
      index_to_docstore_id=index_to_docstore_id
-     # distance_strategy defaults to COSINE, which is appropriate for IndexFlatIP
  )
  # --- Progress Bar Integration Ends Here ---
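
The `typing.Protocol` stand-in for langchain's `Embeddings` base class works structurally: anything exposing `embed_documents` and `embed_query` satisfies the hint, with no inheritance required. A small sketch (the `embedding_dim` helper is hypothetical):

```python
from typing import List, Protocol

class Embeddings(Protocol):
    def embed_documents(self, texts: List[str]) -> List[List[float]]: ...
    def embed_query(self, text: str) -> List[float]: ...

# Any embedder matching the shape above type-checks as Embeddings,
# including tools.embeddings.HuggingFaceEmbeddings, without subclassing.
def embedding_dim(model: Embeddings) -> int:
    return len(model.embed_query("probe"))
```
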
tools/text_splitter.py ADDED
@@ -0,0 +1,112 @@
+ """
+ Custom text splitter to replace langchain RecursiveCharacterTextSplitter.
+ """
+ from typing import List, Optional, Callable
+ import re
+
+
+ class RecursiveCharacterTextSplitter:
+     """Splits text recursively by characters."""
+
+     def __init__(
+         self,
+         chunk_size: int = 1000,
+         chunk_overlap: int = 200,
+         separators: Optional[List[str]] = None,
+         length_function: Optional[Callable[[str], int]] = None,
+         add_start_index: bool = False
+     ):
+         self.chunk_size = chunk_size
+         self.chunk_overlap = chunk_overlap
+         self.separators = separators if separators else ["\n\n", "\n", ". ", "! ", "? ", " ", ""]
+         self.length_function = length_function if length_function else len
+         self.add_start_index = add_start_index
+
+     def split_text(self, text: str) -> List[str]:
+         """Split text into chunks."""
+         if not text:
+             return []
+
+         # Start with the full text
+         splits = [text]
+
+         # Try each separator in order
+         for separator in self.separators:
+             if not separator:
+                 # Last separator - split by character
+                 new_splits = []
+                 for split in splits:
+                     if self.length_function(split) <= self.chunk_size:
+                         new_splits.append(split)
+                     else:
+                         # Split by character
+                         for i in range(0, len(split), self.chunk_size - self.chunk_overlap):
+                             chunk = split[i:i + self.chunk_size]
+                             if chunk:
+                                 new_splits.append(chunk)
+                 splits = new_splits
+                 break
+
+             new_splits = []
+             for split in splits:
+                 if self.length_function(split) <= self.chunk_size:
+                     new_splits.append(split)
+                 else:
+                     # Split by separator
+                     parts = split.split(separator)
+                     current_chunk = ""
+                     for part in parts:
+                         part_with_sep = part if not current_chunk else separator + part
+                         if self.length_function(current_chunk + part_with_sep) <= self.chunk_size:
+                             current_chunk += part_with_sep
+                         else:
+                             if current_chunk:
+                                 new_splits.append(current_chunk)
+                             current_chunk = part_with_sep
+                     if current_chunk:
+                         new_splits.append(current_chunk)
+             splits = new_splits
+
+             # If all splits are small enough, we're done
+             if all(self.length_function(s) <= self.chunk_size for s in splits):
+                 break
+
+         # Apply overlap
+         if self.chunk_overlap > 0 and len(splits) > 1:
+             overlapped_splits = []
+             for i, split in enumerate(splits):
+                 if i == 0:
+                     overlapped_splits.append(split)
+                 else:
+                     # Add overlap from previous chunk
+                     prev_chunk = splits[i - 1]
+                     overlap_text = prev_chunk[-self.chunk_overlap:] if len(prev_chunk) > self.chunk_overlap else prev_chunk
+                     overlapped_splits.append(overlap_text + split)
+             splits = overlapped_splits
+
+         return splits
+
+     def create_documents(
+         self,
+         texts: List[str],
+         metadatas: Optional[List[dict]] = None
+     ) -> List:
+         """Create Document objects from texts."""
+         from tools.document import Document
+
+         all_docs = []
+         metadatas = metadatas if metadatas else [{}] * len(texts)
+
+         for text, metadata in zip(texts, metadatas):
+             splits = self.split_text(text)
+             for i, split in enumerate(splits):
+                 doc_metadata = metadata.copy()
+                 if self.add_start_index:
+                     # Find start index in original text
+                     start_idx = text.find(split)
+                     if start_idx != -1:
+                         doc_metadata["start_index"] = start_idx
+                 all_docs.append(Document(page_content=split, metadata=doc_metadata))
+
+         return all_docs
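
Usage sketch for the custom splitter (illustrative parameters):

```python
from tools.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20,
                                          add_start_index=True)
long_text = "First paragraph about FAISS.\n\nSecond paragraph about BM25. " * 5
docs = splitter.create_documents([long_text], metadatas=[{"source": "demo"}])
for d in docs[:2]:
    # chunk length plus source (start_index is set where the chunk is found verbatim)
    print(len(d.page_content), d.metadata)
```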