daniel-was-taken commited on
Commit
de38977
·
1 Parent(s): cf1fb02

Change to Chatbot

Browse files
.chainlit/config.toml CHANGED
@@ -1,8 +1,4 @@
1
  [project]
2
- # Whether to enable telemetry (default: false). No personal data is collected.
3
- enable_telemetry = false
4
-
5
-
6
  # List of environment variables to be provided by each user to use the app.
7
  user_env = []
8
 
@@ -34,6 +30,10 @@ auto_tag_thread = true
34
  # Allow users to edit their own messages
35
  edit_message = true
36
 
 
 
 
 
37
  # Authorize users to spontaneously upload files with messages
38
  [features.spontaneous_file_upload]
39
  enabled = false
@@ -57,6 +57,9 @@ edit_message = true
57
  [features.mcp.sse]
58
  enabled = true
59
 
 
 
 
60
  [features.mcp.stdio]
61
  enabled = true
62
  # Only the executables in the allow list can be used for MCP stdio server.
@@ -106,13 +109,13 @@ alert_style = "classic"
106
 
107
 
108
  # Specify a custom meta image url.
109
- # custom_meta_image_url = "https://chainlit-cloud.s3.eu-west-3.amazonaws.com/logo/chainlit_banner.png"
110
 
111
  # Load assistant logo directly from URL.
112
- logo_file_url = ""
113
 
114
  # Load assistant avatar image directly from URL.
115
- default_avatar_file_url = ""
116
 
117
  # Specify a custom build directory for the frontend.
118
  # This can be used to customize the frontend code.
@@ -127,4 +130,4 @@ default_avatar_file_url = ""
127
  # url = "https://github.com/Chainlit/chainlit/issues"
128
 
129
  [meta]
130
- generated_by = "2.6.0"
 
1
  [project]
 
 
 
 
2
  # List of environment variables to be provided by each user to use the app.
3
  user_env = []
4
 
 
30
  # Allow users to edit their own messages
31
  edit_message = true
32
 
33
+ [features.slack]
34
+ # Add emoji reaction when message is received (requires reactions:write OAuth scope)
35
+ reaction_on_message_received = false
36
+
37
  # Authorize users to spontaneously upload files with messages
38
  [features.spontaneous_file_upload]
39
  enabled = false
 
57
  [features.mcp.sse]
58
  enabled = true
59
 
60
+ [features.mcp.streamable-http]
61
+ enabled = true
62
+
63
  [features.mcp.stdio]
64
  enabled = true
65
  # Only the executables in the allow list can be used for MCP stdio server.
 
109
 
110
 
111
  # Specify a custom meta image url.
112
+ custom_meta_image_url = "https://chainlit-cloud.s3.eu-west-3.amazonaws.com/logo/chainlit_banner.png"
113
 
114
  # Load assistant logo directly from URL.
115
+ logo_file_url = "public/favicon.svg"
116
 
117
  # Load assistant avatar image directly from URL.
118
+ default_avatar_file_url = "public/favicon.svg"
119
 
120
  # Specify a custom build directory for the frontend.
121
  # This can be used to customize the frontend code.
 
130
  # url = "https://github.com/Chainlit/chainlit/issues"
131
 
132
  [meta]
133
+ generated_by = "2.6.9"
.chainlit/old_config.toml ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ # Whether to enable telemetry (default: false). No personal data is collected.
3
+ enable_telemetry = false
4
+
5
+
6
+ # List of environment variables to be provided by each user to use the app.
7
+ user_env = []
8
+
9
+ # Duration (in seconds) during which the session is saved when the connection is lost
10
+ session_timeout = 3600
11
+
12
+ # Duration (in seconds) of the user session expiry
13
+ user_session_timeout = 1296000 # 15 days
14
+
15
+ # Enable third parties caching (e.g., LangChain cache)
16
+ cache = false
17
+
18
+ # Authorized origins
19
+ allow_origins = ["*"]
20
+
21
+ [features]
22
+ # Process and display HTML in messages. This can be a security risk (see https://stackoverflow.com/questions/19603097/why-is-it-dangerous-to-render-user-generated-html-or-javascript)
23
+ unsafe_allow_html = false
24
+
25
+ # Process and display mathematical expressions. This can clash with "$" characters in messages.
26
+ latex = false
27
+
28
+ # Autoscroll new user messages at the top of the window
29
+ user_message_autoscroll = true
30
+
31
+ # Automatically tag threads with the current chat profile (if a chat profile is used)
32
+ auto_tag_thread = true
33
+
34
+ # Allow users to edit their own messages
35
+ edit_message = true
36
+
37
+ # Authorize users to spontaneously upload files with messages
38
+ [features.spontaneous_file_upload]
39
+ enabled = false
40
+ # Define accepted file types using MIME types
41
+ # Examples:
42
+ # 1. For specific file types:
43
+ # accept = ["image/jpeg", "image/png", "application/pdf"]
44
+ # 2. For all files of certain type:
45
+ # accept = ["image/*", "audio/*", "video/*"]
46
+ # 3. For specific file extensions:
47
+ # accept = { "application/octet-stream" = [".xyz", ".pdb"] }
48
+ # Note: Using "*/*" is not recommended as it may cause browser warnings
49
+ accept = ["*/*"]
50
+ max_files = 20
51
+ max_size_mb = 500
52
+
53
+ [features.audio]
54
+ # Sample rate of the audio
55
+ sample_rate = 24000
56
+
57
+ [features.mcp.sse]
58
+ enabled = true
59
+
60
+ [features.mcp.stdio]
61
+ enabled = true
62
+ # Only the executables in the allow list can be used for MCP stdio server.
63
+ # Only need the base name of the executable, e.g. "npx", not "/usr/bin/npx".
64
+ # Please don't comment this line for now, we need it to parse the executable name.
65
+ allowed_executables = [ "npx", "uvx" ]
66
+
67
+ [UI]
68
+ # Name of the assistant.
69
+ name = "Assistant"
70
+
71
+ # default_theme = "dark"
72
+
73
+ # layout = "wide"
74
+
75
+ # default_sidebar_state = "open"
76
+
77
+ # Description of the assistant. This is used for HTML tags.
78
+ # description = ""
79
+
80
+ # Chain of Thought (CoT) display mode. Can be "hidden", "tool_call" or "full".
81
+ cot = "full"
82
+
83
+ # Specify a CSS file that can be used to customize the user interface.
84
+ # The CSS file can be served from the public directory or via an external link.
85
+ # custom_css = "/public/test.css"
86
+
87
+ # Specify additional attributes for a custom CSS file
88
+ # custom_css_attributes = "media=\"print\""
89
+
90
+ # Specify a JavaScript file that can be used to customize the user interface.
91
+ # The JavaScript file can be served from the public directory.
92
+ # custom_js = "/public/test.js"
93
+
94
+ # The style of alert boxes. Can be "classic" or "modern".
95
+ alert_style = "classic"
96
+
97
+ # Specify additional attributes for custom JS file
98
+ # custom_js_attributes = "async type = \"module\""
99
+
100
+ # Custom login page image, relative to public directory or external URL
101
+ # login_page_image = "/public/custom-background.jpg"
102
+
103
+ # Custom login page image filter (Tailwind internal filters, no dark/light variants)
104
+ login_page_image_filter = "brightness-50 grayscale"
105
+ login_page_image_dark_filter = "contrast-200 blur-sm"
106
+
107
+
108
+ # Specify a custom meta image url.
109
+ custom_meta_image_url = "https://chainlit-cloud.s3.eu-west-3.amazonaws.com/logo/chainlit_banner.png"
110
+
111
+ # Load assistant logo directly from URL.
112
+ logo_file_url = "https://avatars.githubusercontent.com/u/128686189?s=200&v=4"
113
+
114
+ # Load assistant avatar image directly from URL.
115
+ default_avatar_file_url = "https://avatars.githubusercontent.com/u/128686189?s=200&v=4"
116
+
117
+ # Specify a custom build directory for the frontend.
118
+ # This can be used to customize the frontend code.
119
+ # Be careful: If this is a relative path, it should not start with a slash.
120
+ # custom_build = "./public/build"
121
+
122
+ # Specify optional one or more custom links in the header.
123
+ # [[UI.header_links]]
124
+ # name = "Issues"
125
+ # display_name = "Report Issue"
126
+ # icon_url = "https://avatars.githubusercontent.com/u/128686189?s=200&v=4"
127
+ # url = "https://github.com/Chainlit/chainlit/issues"
128
+
129
+ [meta]
130
+ generated_by = "2.6.0"
.gitignore CHANGED
@@ -225,4 +225,6 @@ secrets/
225
  volumes/
226
 
227
  simple_analysis.py
228
- # This file is used for simple analysis of the codebase, such as checking for unused imports or variables.
 
 
 
225
  volumes/
226
 
227
  simple_analysis.py
228
+ # This file is used for simple analysis of the codebase, such as checking for unused imports or variables.
229
+
230
+ RAGAS_test_details/
old_prisma_compose.yml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ services:
2
+ postgres:
3
+ image: postgres:16
4
+ volumes:
5
+ - ./.data/postgres:/var/lib/postgresql/data
6
+ environment:
7
+ - POSTGRES_USER=${POSTGRES_USER:-root}
8
+ - POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-root}
9
+ - POSTGRES_DB=${POSTGRES_DB:-postgres}
10
+ ports:
11
+ - ${POSTGRES_PORT:-5432}:5432
12
+ localstack:
13
+ image: localstack/localstack:latest
14
+ environment:
15
+ SERVICES: s3
16
+ ports:
17
+ - 4566:4566
18
+ volumes:
19
+ - ./localstack-script.sh:/etc/localstack/init/ready.d/script.sh
20
+ - "/var/run/docker.sock:/var/run/docker.sock"
21
+ # Uncomment below to simulate Azure Blob Storage (don't forget to run the init_azure_storage.py to create the container)
22
+ # azurite:
23
+ # image: mcr.microsoft.com/azure-storage/azurite
24
+ # ports:
25
+ # - "10000:10000" # Blob service
26
+ # - "10001:10001" # Queue service
27
+ # - "10002:10002" # Table service
28
+ # volumes:
29
+ # - ./.data/azurite:/data
30
+ # command: "azurite --blobHost 0.0.0.0 --queueHost 0.0.0.0 --tableHost 0.0.0.0"
populate_db.py CHANGED
@@ -1,94 +1,113 @@
1
- import time
2
  import os
 
3
  from pathlib import Path
 
4
  from dotenv import load_dotenv
5
- from unstructured.cleaners.core import clean_extra_whitespace, replace_unicode_quotes, clean_dashes, group_broken_paragraphs
6
- from langchain_unstructured import UnstructuredLoader
7
- from sentence_transformers import SentenceTransformer
8
- from pymilvus import MilvusClient, DataType
9
  from langchain_nebius import NebiusEmbeddings
 
10
  from pydantic import SecretStr
11
- import os
 
 
 
 
12
 
13
- # Load environment variables from .env file
14
  load_dotenv()
15
 
16
- # Initialize Milvus client and collection setup
17
  MILVUS_URI = os.getenv("MILVUS_URI", "http://localhost:19530")
 
 
 
 
 
 
 
 
 
 
 
 
18
  milvus_client = MilvusClient(uri=MILVUS_URI)
19
- collection_name = "my_rag_collection"
20
 
21
- # Initialize embedding model
22
- # embedding_model = SentenceTransformer("BAAI/bge-small-en-v1.5")
23
- # embedding_model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")
24
  embedding_model = NebiusEmbeddings(
25
  api_key=SecretStr(os.getenv("NEBIUS_API_KEY", os.getenv("OPENAI_API_KEY"))),
26
  model="Qwen/Qwen3-Embedding-8B",
27
  base_url="https://api.studio.nebius.ai/v1"
28
  )
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
 
32
- def emb_text(text):
33
- """Generate embeddings for text using the sentence transformer model."""
34
  return embedding_model.embed_query(text)
35
- # return embedding_model.encode([text], normalize_embeddings=True).tolist()[0]
36
 
37
- def emb_text_batch(texts):
38
- """Generate embeddings for multiple texts in batch - more efficient."""
 
39
  return embedding_model.embed_documents(texts)
40
 
41
- def process_embeddings_in_batches(texts_to_embed, batch_size=50):
42
- """Process embeddings in batches with error handling and fallback."""
 
43
  all_embeddings = []
 
44
 
45
- print(f"Generating embeddings in batches of {batch_size}...")
46
 
47
- for i in range(0, len(texts_to_embed), batch_size):
48
- batch_texts = texts_to_embed[i:i + batch_size]
49
- batch_end = min(i + batch_size, len(texts_to_embed))
50
 
51
- print(f"Processing embedding batch {i//batch_size + 1}/{(len(texts_to_embed) + batch_size - 1)//batch_size} (documents {i+1}-{batch_end})")
52
 
53
  try:
54
- batch_embeddings = emb_text_batch(batch_texts)
55
  all_embeddings.extend(batch_embeddings)
56
-
57
- # Add a small delay between batches to be respectful to the API
58
- time.sleep(1.5)
59
 
60
  except Exception as e:
61
- print(f"Error processing batch {i//batch_size + 1}: {e}")
62
- print("Falling back to individual processing for this batch...")
63
 
64
- # Fallback to individual processing for this batch
65
  for j, text in enumerate(batch_texts):
66
  try:
67
- embedding = emb_text(text)
68
  all_embeddings.append(embedding)
69
- print(f" Individual embedding {i+j+1} completed")
70
- time.sleep(2) # Longer delay for individual requests
71
  except Exception as individual_error:
72
- print(f" Failed to process document {i+j+1}: {individual_error}")
73
- # Use a zero vector as fallback
74
- all_embeddings.append([0.0] * 4096)
75
 
76
  return all_embeddings
77
 
78
  def create_collection():
79
- """Create collection if it doesn't exist."""
80
- if milvus_client.has_collection(collection_name):
81
- milvus_client.load_collection(collection_name=collection_name)
82
  return
83
 
84
- # Create Milvus collection schema
85
  schema = milvus_client.create_schema(auto_id=False, enable_dynamic_field=False)
86
  schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
87
- schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=4096) # Qwen/Qwen3-Embedding-8B dimension
88
- schema.add_field(field_name="text", datatype=DataType.VARCHAR, max_length=65535) # Maximum allowed for VARCHAR
89
  schema.add_field(field_name="metadata", datatype=DataType.JSON)
90
 
91
- # Create index for vector search
92
  index_params = MilvusClient.prepare_index_params()
93
  index_params.add_index(
94
  field_name="vector",
@@ -98,143 +117,191 @@ def create_collection():
98
 
99
  # Create and load collection
100
  milvus_client.create_collection(
101
- collection_name=collection_name,
102
  schema=schema,
103
  index_params=index_params,
104
  consistency_level="Strong",
105
  )
106
- milvus_client.load_collection(collection_name=collection_name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
- # Document directory
109
- directory_path = "data/"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
  def main():
112
- """Main function to load documents and insert them into Milvus."""
113
  create_collection()
114
 
115
  # Check if collection already has data
116
- stats = milvus_client.get_collection_stats(collection_name)
117
  if stats['row_count'] > 0:
118
  print(f"Collection already contains {stats['row_count']} documents. Skipping insertion.")
119
  return
120
 
121
- docs = unstructured_document_loader()
 
 
 
 
122
 
123
- # Process documents in small chunks to avoid memory issues on 4GB droplet
124
- chunk_size = 100 # Very conservative chunk size for 4GB memory
125
  total_docs = len(docs)
126
- total_chunks = (total_docs + chunk_size - 1) // chunk_size
127
-
128
- print(f"🔧 Memory-efficient processing: {total_docs} documents in {total_chunks} chunks of {chunk_size}")
129
- print("📊 This approach prevents OOM kills on your 4GB DigitalOcean droplet")
130
-
131
  total_inserted = 0
132
 
133
- for chunk_idx in range(0, total_docs, chunk_size):
134
- chunk_end = min(chunk_idx + chunk_size, total_docs)
135
- chunk_num = chunk_idx // chunk_size + 1
136
-
137
- print(f"\n{'='*40}")
138
- print(f"CHUNK {chunk_num}/{total_chunks} | Docs {chunk_idx + 1}-{chunk_end}")
139
- print(f"{'='*40}")
140
-
141
- # Get current chunk of documents
142
  current_chunk = docs[chunk_idx:chunk_end]
143
 
144
- # Process this chunk
145
- texts_to_embed = []
146
- doc_data = []
147
-
148
- for i, doc in enumerate(current_chunk):
149
- text_content = doc.page_content
150
- if len(text_content) > 65000:
151
- text_content = text_content[:65000]
152
- print(f"📄 Doc {chunk_idx + i + 1} truncated: {len(doc.page_content)} → {len(text_content)} chars")
153
-
154
- texts_to_embed.append(text_content)
155
- doc_data.append({
156
- "id": chunk_idx + i,
157
- "text": text_content,
158
- "metadata": doc.metadata if doc.metadata else {}
159
- })
160
-
161
- # Generate embeddings with small batch size
162
- print(f"🚀 Generating embeddings for {len(texts_to_embed)} documents...")
163
- all_embeddings = process_embeddings_in_batches(texts_to_embed, batch_size=5) # Very small batches
164
-
165
- # Prepare and insert data
166
- data_to_insert = []
167
- for doc_info, embedding in zip(doc_data, all_embeddings):
168
- data_to_insert.append({
169
- "id": doc_info["id"],
170
- "vector": embedding,
171
- "text": doc_info["text"],
172
- "metadata": doc_info["metadata"]
173
- })
174
-
175
- # Insert to Milvus
176
- insert_result = milvus_client.insert(collection_name=collection_name, data=data_to_insert)
177
- chunk_inserted = insert_result['insert_count']
178
  total_inserted += chunk_inserted
179
 
180
- print(f"Chunk {chunk_num} complete: {chunk_inserted} docs inserted")
181
- print(f"📈 Overall progress: {total_inserted}/{total_docs} ({(total_inserted/total_docs)*100:.1f}%)")
182
 
183
- # Critical: Free memory before next chunk
184
- del texts_to_embed, doc_data, all_embeddings, data_to_insert, current_chunk
185
-
186
- # Brief pause between chunks
187
  if chunk_num < total_chunks:
188
- print("⏱️ Memory cleanup pause (2s)...")
189
  time.sleep(2)
190
 
191
- print(f"\n🎉 SUCCESS! All {total_inserted} documents processed and inserted!")
192
- return docs
193
 
194
- def unstructured_document_loader():
195
- """Load documents using UnstructuredLoader."""
196
- # Collect file paths for PDF, DOCX, and HTML files
197
- file_extensions = ["*.pdf", "*.docx", "*.html"]
198
- file_paths = []
199
-
200
- for ext in file_extensions:
201
- file_paths.extend(Path(directory_path).glob(ext))
202
-
203
- # Convert Path objects to strings
204
- file_paths = [str(file) for file in file_paths]
205
-
206
- # Configure UnstructuredLoader with post-processors
207
- loader = UnstructuredLoader(
208
- file_paths,
209
- chunking_strategy="by_title",
210
- include_orig_elements=False,
211
- post_processors=[
212
- clean_extra_whitespace,
213
- replace_unicode_quotes,
214
- clean_dashes,
215
- group_broken_paragraphs
216
- ]
217
- )
218
-
219
- docs = loader.load()
220
- print(f"Number of LangChain documents: {len(docs)}")
221
- print(f"Length of first document: {len(docs[0].page_content)} characters")
222
- print(f"First document preview: {docs[0].page_content[:200]}...")
223
-
224
- return docs
225
 
226
  def verify_insertion():
227
  """Verify that data was successfully inserted into Milvus."""
228
- # Get collection statistics
229
- stats = milvus_client.get_collection_stats(collection_name)
230
  print(f"Collection stats: {stats}")
231
 
232
- # Test search functionality with a sample query
233
- test_query = "Questions by staff to other staff"
234
- test_embedding = emb_text(test_query)
235
 
236
  search_results = milvus_client.search(
237
- collection_name=collection_name,
238
  data=[test_embedding],
239
  limit=3,
240
  output_fields=["text", "metadata"]
@@ -252,17 +319,14 @@ def verify_insertion():
252
  if __name__ == "__main__":
253
  start_time = time.time()
254
 
255
- print("="*60)
256
- print("STARTING DOCUMENT PROCESSING AND MILVUS INSERTION")
257
- print("="*60)
258
 
259
  main()
260
 
261
- print("\n" + "="*50)
262
- print("VERIFYING DATA INSERTION")
263
- print("="*50)
264
  verify_insertion()
265
 
266
- end_time = time.time()
267
- elapsed_time = end_time - start_time
268
  print(f"\nTotal execution time: {elapsed_time:.2f} seconds")
 
 
1
  import os
2
+ import time
3
  from pathlib import Path
4
+
5
  from dotenv import load_dotenv
 
 
 
 
6
  from langchain_nebius import NebiusEmbeddings
7
+ from langchain_unstructured import UnstructuredLoader
8
  from pydantic import SecretStr
9
+ from pymilvus import MilvusClient, DataType
10
+ from unstructured.cleaners.core import (
11
+ clean_extra_whitespace,
12
+ replace_unicode_quotes
13
+ )
14
 
15
+ # Load environment variables
16
  load_dotenv()
17
 
18
+ # Configuration constants
19
  MILVUS_URI = os.getenv("MILVUS_URI", "http://localhost:19530")
20
+ COLLECTION_NAME = "my_rag_collection"
21
+ DOCUMENT_DIR = "data/"
22
+ EMBEDDING_DIMENSION = 4096
23
+ TEXT_MAX_LENGTH = 65000
24
+ CHUNK_SIZE = 100
25
+ BATCH_SIZE = 5
26
+
27
+ # Chunking configuration
28
+ MAX_CHARACTERS = 1500
29
+ COMBINE_TEXT_UNDER_N_CHARS = 200
30
+
31
+ # Initialize clients
32
  milvus_client = MilvusClient(uri=MILVUS_URI)
 
33
 
 
 
 
34
  embedding_model = NebiusEmbeddings(
35
  api_key=SecretStr(os.getenv("NEBIUS_API_KEY", os.getenv("OPENAI_API_KEY"))),
36
  model="Qwen/Qwen3-Embedding-8B",
37
  base_url="https://api.studio.nebius.ai/v1"
38
  )
39
 
40
+ def clean_text(text):
41
+ """Simple text cleaning for educational documents."""
42
+ import re
43
+
44
+ # Basic cleaning without problematic functions
45
+ text = clean_extra_whitespace(text)
46
+ text = replace_unicode_quotes(text)
47
+
48
+ # Simple normalizations
49
+ text = re.sub(r'[\r\n]+', ' ', text) # Convert newlines to spaces
50
+ text = re.sub(r'\s+', ' ', text) # Multiple spaces to single space
51
+
52
+ return text.strip()
53
 
54
 
55
+ def generate_embedding(text):
56
+ """Generate embedding for a single text."""
57
  return embedding_model.embed_query(text)
 
58
 
59
+
60
+ def generate_embeddings_batch(texts):
61
+ """Generate embeddings for multiple texts efficiently."""
62
  return embedding_model.embed_documents(texts)
63
 
64
+
65
+ def process_embeddings_in_batches(texts, batch_size=BATCH_SIZE):
66
+ """Process embeddings in batches with error handling."""
67
  all_embeddings = []
68
+ total_batches = (len(texts) + batch_size - 1) // batch_size
69
 
70
+ print(f"Generating embeddings in {total_batches} batches of {batch_size}...")
71
 
72
+ for i in range(0, len(texts), batch_size):
73
+ batch_texts = texts[i:i + batch_size]
74
+ batch_num = i // batch_size + 1
75
 
76
+ print(f"Processing batch {batch_num}/{total_batches}")
77
 
78
  try:
79
+ batch_embeddings = generate_embeddings_batch(batch_texts)
80
  all_embeddings.extend(batch_embeddings)
81
+ time.sleep(1.5) # API rate limiting
 
 
82
 
83
  except Exception as e:
84
+ print(f"Batch {batch_num} failed: {e}. Processing individually...")
 
85
 
 
86
  for j, text in enumerate(batch_texts):
87
  try:
88
+ embedding = generate_embedding(text)
89
  all_embeddings.append(embedding)
90
+ time.sleep(2)
 
91
  except Exception as individual_error:
92
+ print(f"Failed to process document {i+j+1}: {individual_error}")
93
+ all_embeddings.append([0.0] * EMBEDDING_DIMENSION)
 
94
 
95
  return all_embeddings
96
 
97
  def create_collection():
98
+ """Create Milvus collection if it doesn't exist."""
99
+ if milvus_client.has_collection(COLLECTION_NAME):
100
+ milvus_client.load_collection(collection_name=COLLECTION_NAME)
101
  return
102
 
103
+ # Create collection schema
104
  schema = milvus_client.create_schema(auto_id=False, enable_dynamic_field=False)
105
  schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
106
+ schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=EMBEDDING_DIMENSION)
107
+ schema.add_field(field_name="text", datatype=DataType.VARCHAR, max_length=65535)
108
  schema.add_field(field_name="metadata", datatype=DataType.JSON)
109
 
110
+ # Create vector index
111
  index_params = MilvusClient.prepare_index_params()
112
  index_params.add_index(
113
  field_name="vector",
 
117
 
118
  # Create and load collection
119
  milvus_client.create_collection(
120
+ collection_name=COLLECTION_NAME,
121
  schema=schema,
122
  index_params=index_params,
123
  consistency_level="Strong",
124
  )
125
+ milvus_client.load_collection(collection_name=COLLECTION_NAME)
126
+
127
+ def load_documents():
128
+ """Load documents from the data directory."""
129
+ file_extensions = ["*.pdf", "*.docx", "*.html"]
130
+ file_paths = []
131
+
132
+ for ext in file_extensions:
133
+ file_paths.extend(Path(DOCUMENT_DIR).glob(ext))
134
+
135
+ file_paths = [str(file) for file in file_paths]
136
+
137
+ loader = UnstructuredLoader(
138
+ file_paths,
139
+ chunking_strategy="by_title",
140
+ include_orig_elements=False
141
+ )
142
+
143
+ docs = loader.load()
144
+ print(f"Loaded {len(docs)} initial documents")
145
+
146
+ # Apply additional cleaning and chunking
147
+ final_chunks = []
148
+
149
+ for doc in docs:
150
+ # Clean text
151
+ cleaned_text = clean_text(doc.page_content)
152
+
153
+ # Skip very short chunks
154
+ if len(cleaned_text) < 50:
155
+ continue
156
+
157
+ # Split if too large
158
+ if len(cleaned_text) <= MAX_CHARACTERS:
159
+ doc.page_content = cleaned_text
160
+ final_chunks.append(doc)
161
+ else:
162
+ # Split large chunks on sentence boundaries
163
+ chunks = _split_large_chunk(cleaned_text, doc.metadata)
164
+ final_chunks.extend(chunks)
165
+
166
+ print(f"Final processed chunks: {len(final_chunks)}")
167
+ if final_chunks:
168
+ avg_length = sum(len(doc.page_content) for doc in final_chunks) / len(final_chunks)
169
+ print(f"Average chunk length: {avg_length:.0f} characters")
170
+
171
+ return final_chunks
172
+
173
+
174
+ def _split_large_chunk(text, metadata):
175
+ """Split large text into smaller chunks."""
176
+ from langchain.schema import Document
177
+
178
+ chunks = []
179
+ sentences = text.split('. ')
180
+ current_chunk = ""
181
+
182
+ for sentence in sentences:
183
+ potential_chunk = current_chunk + sentence + '. '
184
+
185
+ if len(potential_chunk) > MAX_CHARACTERS and len(current_chunk) > COMBINE_TEXT_UNDER_N_CHARS:
186
+ if current_chunk.strip():
187
+ chunks.append(Document(
188
+ page_content=current_chunk.strip(),
189
+ metadata=metadata.copy()
190
+ ))
191
+ current_chunk = sentence + '. '
192
+ else:
193
+ current_chunk = potential_chunk
194
+
195
+ # Add remaining content
196
+ if current_chunk.strip():
197
+ chunks.append(Document(
198
+ page_content=current_chunk.strip(),
199
+ metadata=metadata.copy()
200
+ ))
201
+
202
+ return chunks
203
+
204
+
205
+ def prepare_document_data(docs, start_idx=0):
206
+ """Prepare document data for insertion."""
207
+ texts_to_embed = []
208
+ doc_data = []
209
+
210
+ for i, doc in enumerate(docs):
211
+ text_content = doc.page_content
212
+ if len(text_content) > TEXT_MAX_LENGTH:
213
+ text_content = text_content[:TEXT_MAX_LENGTH]
214
+ print(f"Document {start_idx + i + 1} truncated to {TEXT_MAX_LENGTH} characters")
215
+
216
+ texts_to_embed.append(text_content)
217
+ doc_data.append({
218
+ "id": start_idx + i,
219
+ "text": text_content,
220
+ "metadata": doc.metadata or {}
221
+ })
222
+
223
+ return texts_to_embed, doc_data
224
+
225
 
226
+ def process_document_chunk(docs, chunk_idx, chunk_num, total_chunks):
227
+ """Process a single chunk of documents."""
228
+ print(f"\nProcessing chunk {chunk_num}/{total_chunks}")
229
+
230
+ # Prepare document data
231
+ texts_to_embed, doc_data = prepare_document_data(docs, chunk_idx)
232
+
233
+ # Generate embeddings
234
+ print(f"Generating embeddings for {len(texts_to_embed)} documents...")
235
+ embeddings = process_embeddings_in_batches(texts_to_embed)
236
+
237
+ # Prepare data for insertion
238
+ data_to_insert = []
239
+ for doc_info, embedding in zip(doc_data, embeddings):
240
+ data_to_insert.append({
241
+ "id": doc_info["id"],
242
+ "vector": embedding,
243
+ "text": doc_info["text"],
244
+ "metadata": doc_info["metadata"]
245
+ })
246
+
247
+ # Insert into Milvus
248
+ insert_result = milvus_client.insert(collection_name=COLLECTION_NAME, data=data_to_insert)
249
+ return insert_result['insert_count']
250
 
251
  def main():
252
+ """Main function to process and insert documents into Milvus."""
253
  create_collection()
254
 
255
  # Check if collection already has data
256
+ stats = milvus_client.get_collection_stats(COLLECTION_NAME)
257
  if stats['row_count'] > 0:
258
  print(f"Collection already contains {stats['row_count']} documents. Skipping insertion.")
259
  return
260
 
261
+ # Load documents
262
+ docs = load_documents()
263
+ if not docs:
264
+ print("No documents found to process.")
265
+ return
266
 
267
+ # Process documents in chunks
 
268
  total_docs = len(docs)
269
+ total_chunks = (total_docs + CHUNK_SIZE - 1) // CHUNK_SIZE
 
 
 
 
270
  total_inserted = 0
271
 
272
+ print(f"Processing {total_docs} documents in {total_chunks} chunks of {CHUNK_SIZE}")
273
+
274
+ for chunk_idx in range(0, total_docs, CHUNK_SIZE):
275
+ chunk_end = min(chunk_idx + CHUNK_SIZE, total_docs)
276
+ chunk_num = chunk_idx // CHUNK_SIZE + 1
 
 
 
 
277
  current_chunk = docs[chunk_idx:chunk_end]
278
 
279
+ # Process chunk
280
+ chunk_inserted = process_document_chunk(current_chunk, chunk_idx, chunk_num, total_chunks)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
  total_inserted += chunk_inserted
282
 
283
+ print(f"Chunk {chunk_num} complete: {chunk_inserted} docs inserted")
284
+ print(f"Progress: {total_inserted}/{total_docs} ({(total_inserted/total_docs)*100:.1f}%)")
285
 
286
+ # Memory cleanup
287
+ del current_chunk
 
 
288
  if chunk_num < total_chunks:
 
289
  time.sleep(2)
290
 
291
+ print(f"\nSuccessfully processed {total_inserted} documents!")
 
292
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
 
294
  def verify_insertion():
295
  """Verify that data was successfully inserted into Milvus."""
296
+ stats = milvus_client.get_collection_stats(COLLECTION_NAME)
 
297
  print(f"Collection stats: {stats}")
298
 
299
+ # Test search functionality
300
+ test_query = "Why should reasonable adjustments be made?"
301
+ test_embedding = generate_embedding(test_query)
302
 
303
  search_results = milvus_client.search(
304
+ collection_name=COLLECTION_NAME,
305
  data=[test_embedding],
306
  limit=3,
307
  output_fields=["text", "metadata"]
 
319
  if __name__ == "__main__":
320
  start_time = time.time()
321
 
322
+ print("Starting document processing and Milvus insertion")
323
+ print("=" * 60)
 
324
 
325
  main()
326
 
327
+ print("\nVerifying data insertion")
328
+ print("=" * 30)
 
329
  verify_insertion()
330
 
331
+ elapsed_time = time.time() - start_time
 
332
  print(f"\nTotal execution time: {elapsed_time:.2f} seconds")
prisma/migrations/20250103173917_init_data_layer/migration.sql ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
-- Prisma-generated initial migration for the Chainlit data layer.
-- NOTE: migration files are checksummed by Prisma (`_prisma_migrations`);
-- do not hand-edit after it has been applied anywhere.

-- CreateExtension
-- pgcrypto provides gen_random_uuid(), used as the default for every "id".
CREATE EXTENSION IF NOT EXISTS "pgcrypto";

-- CreateEnum
CREATE TYPE "StepType" AS ENUM ('assistant_message', 'embedding', 'llm', 'retrieval', 'rerank', 'run', 'system_message', 'tool', 'undefined', 'user_message');

-- CreateTable
-- Elements are files/attachments rendered within a step (and optionally a thread).
CREATE TABLE "Element" (
    "id" TEXT NOT NULL DEFAULT gen_random_uuid(),
    "createdAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
    "updatedAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
    "threadId" TEXT,
    "stepId" TEXT NOT NULL,
    "metadata" JSONB NOT NULL,
    "mime" TEXT,
    "name" TEXT NOT NULL,
    "objectKey" TEXT,
    "url" TEXT,
    "chainlitKey" TEXT,
    "display" TEXT,
    "size" TEXT,
    "language" TEXT,
    "page" INTEGER,
    "props" JSONB,

    CONSTRAINT "Element_pkey" PRIMARY KEY ("id")
);

-- CreateTable
CREATE TABLE "User" (
    "id" TEXT NOT NULL DEFAULT gen_random_uuid(),
    "createdAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
    "updatedAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
    "metadata" JSONB NOT NULL,
    "identifier" TEXT NOT NULL,

    CONSTRAINT "User_pkey" PRIMARY KEY ("id")
);

-- CreateTable
CREATE TABLE "Feedback" (
    "id" TEXT NOT NULL DEFAULT gen_random_uuid(),
    "createdAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
    "updatedAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
    "stepId" TEXT,
    "name" TEXT NOT NULL,
    "value" DOUBLE PRECISION NOT NULL,
    "comment" TEXT,

    CONSTRAINT "Feedback_pkey" PRIMARY KEY ("id")
);

-- CreateTable
-- Steps form a tree (self-referencing "parentId") inside a thread.
CREATE TABLE "Step" (
    "id" TEXT NOT NULL DEFAULT gen_random_uuid(),
    "createdAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
    "updatedAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
    "parentId" TEXT,
    "threadId" TEXT,
    "input" TEXT,
    "metadata" JSONB NOT NULL,
    "name" TEXT,
    "output" TEXT,
    "type" "StepType" NOT NULL,
    "showInput" TEXT DEFAULT 'json',
    "isError" BOOLEAN DEFAULT false,
    "startTime" TIMESTAMP(3) NOT NULL,
    "endTime" TIMESTAMP(3) NOT NULL,

    CONSTRAINT "Step_pkey" PRIMARY KEY ("id")
);

-- CreateTable
CREATE TABLE "Thread" (
    "id" TEXT NOT NULL DEFAULT gen_random_uuid(),
    "createdAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
    "updatedAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
    "deletedAt" TIMESTAMP(3),
    "name" TEXT,
    "metadata" JSONB NOT NULL,
    "userId" TEXT,

    CONSTRAINT "Thread_pkey" PRIMARY KEY ("id")
);

-- CreateIndex
CREATE INDEX "Element_stepId_idx" ON "Element"("stepId");

-- CreateIndex
CREATE INDEX "Element_threadId_idx" ON "Element"("threadId");

-- CreateIndex
CREATE INDEX "User_identifier_idx" ON "User"("identifier");

-- CreateIndex
CREATE UNIQUE INDEX "User_identifier_key" ON "User"("identifier");

-- CreateIndex
CREATE INDEX "Feedback_createdAt_idx" ON "Feedback"("createdAt");

-- CreateIndex
CREATE INDEX "Feedback_name_idx" ON "Feedback"("name");

-- CreateIndex
CREATE INDEX "Feedback_stepId_idx" ON "Feedback"("stepId");

-- CreateIndex
CREATE INDEX "Feedback_value_idx" ON "Feedback"("value");

-- CreateIndex
CREATE INDEX "Feedback_name_value_idx" ON "Feedback"("name", "value");

-- CreateIndex
CREATE INDEX "Step_createdAt_idx" ON "Step"("createdAt");

-- CreateIndex
CREATE INDEX "Step_endTime_idx" ON "Step"("endTime");

-- CreateIndex
CREATE INDEX "Step_parentId_idx" ON "Step"("parentId");

-- CreateIndex
CREATE INDEX "Step_startTime_idx" ON "Step"("startTime");

-- CreateIndex
CREATE INDEX "Step_threadId_idx" ON "Step"("threadId");

-- CreateIndex
CREATE INDEX "Step_type_idx" ON "Step"("type");

-- CreateIndex
CREATE INDEX "Step_name_idx" ON "Step"("name");

-- CreateIndex
-- Composite index supporting time-windowed queries within a thread.
CREATE INDEX "Step_threadId_startTime_endTime_idx" ON "Step"("threadId", "startTime", "endTime");

-- CreateIndex
CREATE INDEX "Thread_createdAt_idx" ON "Thread"("createdAt");

-- CreateIndex
CREATE INDEX "Thread_name_idx" ON "Thread"("name");

-- AddForeignKey
-- Deleting a Step/Thread cascades to its elements; deleting a Step only
-- nulls out Feedback.stepId (feedback records survive).
ALTER TABLE "Element" ADD CONSTRAINT "Element_stepId_fkey" FOREIGN KEY ("stepId") REFERENCES "Step"("id") ON DELETE CASCADE ON UPDATE CASCADE;

-- AddForeignKey
ALTER TABLE "Element" ADD CONSTRAINT "Element_threadId_fkey" FOREIGN KEY ("threadId") REFERENCES "Thread"("id") ON DELETE CASCADE ON UPDATE CASCADE;

-- AddForeignKey
ALTER TABLE "Feedback" ADD CONSTRAINT "Feedback_stepId_fkey" FOREIGN KEY ("stepId") REFERENCES "Step"("id") ON DELETE SET NULL ON UPDATE CASCADE;

-- AddForeignKey
ALTER TABLE "Step" ADD CONSTRAINT "Step_parentId_fkey" FOREIGN KEY ("parentId") REFERENCES "Step"("id") ON DELETE CASCADE ON UPDATE CASCADE;

-- AddForeignKey
ALTER TABLE "Step" ADD CONSTRAINT "Step_threadId_fkey" FOREIGN KEY ("threadId") REFERENCES "Thread"("id") ON DELETE CASCADE ON UPDATE CASCADE;

-- AddForeignKey
ALTER TABLE "Thread" ADD CONSTRAINT "Thread_userId_fkey" FOREIGN KEY ("userId") REFERENCES "User"("id") ON DELETE SET NULL ON UPDATE CASCADE;
prisma/migrations/20250108095538_add_tags_to_thread/migration.sql ADDED
@@ -0,0 +1,2 @@
 
 
 
1
-- AlterTable
-- Adds a tags column to Thread; existing rows get an empty text array.
ALTER TABLE "Thread" ADD COLUMN "tags" TEXT[] DEFAULT ARRAY[]::TEXT[];
prisma/migrations/migration_lock.toml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Please do not edit this file manually
2
+ # It should be added in your version-control system (e.g., Git)
3
+ provider = "postgresql"
prisma/schema.prisma ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// Prisma schema for the Chainlit data layer: users, threads, steps,
// elements (attachments) and feedback.
generator client {
  provider = "prisma-client-js"
  // NOTE(review): `interface` and `recursive_type_depth` are Prisma Client
  // *Python* generator options; confirm the prisma-client-js generator
  // accepts (or ignores) them, otherwise `prisma generate` may reject them.
  interface = "asyncio"
  recursive_type_depth = 5
  // Required so `extensions = [pgcrypto]` can be declared in the datasource.
  previewFeatures = ["postgresqlExtensions"]
}

datasource db {
  provider = "postgresql"
  url      = env("DATABASE_URL")
  // Prisma migrations run through the direct URL. Replace as needed.
  directUrl = env("DATABASE_URL")
  // pgcrypto provides gen_random_uuid(), the default id generator below.
  extensions = [pgcrypto]
}

// A file/attachment rendered inside a step (and optionally a thread).
model Element {
  id        String   @id @default(dbgenerated("gen_random_uuid()"))
  createdAt DateTime @default(now())
  updatedAt DateTime @default(now()) @updatedAt

  threadId  String?
  stepId    String
  metadata  Json
  mime      String?
  name      String
  objectKey String?
  url       String?
  // Cascade: removing a step or thread removes its elements.
  step      Step     @relation(fields: [stepId], references: [id], onDelete: Cascade)
  thread    Thread?  @relation(fields: [threadId], references: [id], onDelete: Cascade)

  chainlitKey String?
  display     String?
  size        String?
  language    String?
  page        Int?
  props       Json?

  @@index([stepId])
  @@index([threadId])
}

model User {
  id         String   @id @default(dbgenerated("gen_random_uuid()"))
  createdAt  DateTime @default(now())
  updatedAt  DateTime @default(now()) @updatedAt
  metadata   Json
  // Login identifier; enforced unique below.
  identifier String
  threads    Thread[]

  @@unique([identifier])
  @@index([identifier])
}

model Feedback {
  id        String   @id @default(dbgenerated("gen_random_uuid()"))
  createdAt DateTime @default(now())
  updatedAt DateTime @default(now()) @updatedAt

  // Optional link to the step being rated; default onDelete (SetNull per
  // the generated migration) keeps feedback rows when a step is deleted.
  stepId String?
  Step   Step?   @relation(fields: [stepId], references: [id])

  name  String
  value Float

  comment String?

  @@index(createdAt)
  @@index(name)
  @@index(stepId)
  @@index(value)
  @@index([name, value])
}

// A single unit of work in a conversation; steps form a tree via parentId.
model Step {
  id        String   @id @default(dbgenerated("gen_random_uuid()"))
  createdAt DateTime @default(now())
  updatedAt DateTime @default(now()) @updatedAt
  parentId  String?
  threadId  String?

  input     String?
  metadata  Json
  name      String?
  output    String?
  type      StepType
  showInput String?  @default("json")
  isError   Boolean? @default(false)

  startTime DateTime
  endTime   DateTime

  elements Element[]
  parent   Step?      @relation("ParentChild", fields: [parentId], references: [id], onDelete: Cascade)
  children Step[]     @relation("ParentChild")
  thread   Thread?    @relation(fields: [threadId], references: [id], onDelete: Cascade)
  Feedback Feedback[]

  @@index([createdAt])
  @@index([endTime])
  @@index([parentId])
  @@index([startTime])
  @@index([threadId])
  @@index([type])
  @@index([name])
  // Composite index for time-windowed queries within a thread.
  @@index([threadId, startTime, endTime])
}

model Thread {
  id        String    @id @default(dbgenerated("gen_random_uuid()"))
  createdAt DateTime  @default(now())
  updatedAt DateTime  @default(now()) @updatedAt
  // Soft-delete marker; rows are kept with deletedAt set.
  deletedAt DateTime?

  name     String?
  metadata Json
  tags     String[] @default([])

  elements Element[]
  userId   String?
  User     User?     @relation(fields: [userId], references: [id])
  steps    Step[]

  @@index([createdAt])
  @@index([name])
}

enum StepType {
  assistant_message
  embedding
  llm
  retrieval
  rerank
  run
  system_message
  tool
  undefined
  user_message
}
public/favicon.svg ADDED
public/logo.png ADDED
requirements.txt CHANGED
@@ -13,3 +13,5 @@ fastapi>=0.100.0
13
  uvicorn>=0.20.0
14
  langchain_nebius>=0.1.0
15
  asyncpg>=0.29.0
 
 
 
13
  uvicorn>=0.20.0
14
  langchain_nebius>=0.1.0
15
  asyncpg>=0.29.0
16
+ ragas>=0.1.0
17
+ datasets>=2.0.0
test_chainlit.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Unit tests for the Chainlit password-auth callback in ``app.py``.

Requires a ``PASSWORD`` environment variable (loaded from ``.env``).
"""

import os
import unittest

import chainlit as cl
from dotenv import load_dotenv

from app import auth  # Chainlit password-auth callback under test

# Load environment variables (PASSWORD) before any test runs.
load_dotenv()


class TestClass(unittest.TestCase):
    """Authentication behaviour of the ``auth`` callback."""

    # Bare `assert` statements are removed under `python -O`, which would
    # make these tests pass vacuously; unittest's assertion methods always
    # execute and produce informative failure messages.

    def test_authentication_valid_credentials(self):
        # Valid credentials should yield a cl.User object.
        user = auth("admin", os.getenv("PASSWORD"))
        self.assertIsInstance(user, cl.User)

    def test_authentication_invalid_credentials(self):
        # A wrong password must not authenticate (no cl.User returned).
        user = auth("admin", "wrong_password")
        self.assertNotIsInstance(user, cl.User)

    def test_authentication_invalid_username(self):
        # An unknown username must not authenticate, even with the
        # correct password.
        user = auth("wrong_user", os.getenv("PASSWORD"))
        self.assertNotIsInstance(user, cl.User)


if __name__ == "__main__":
    unittest.main()
test_ragas.ipynb ADDED
@@ -0,0 +1,447 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "6bb3bb7d",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stderr",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "f:\\Dissertation\\prod-rag-chat\\.venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
14
+ " from .autonotebook import tqdm as notebook_tqdm\n"
15
+ ]
16
+ },
17
+ {
18
+ "name": "stdout",
19
+ "output_type": "stream",
20
+ "text": [
21
+ "2025-08-16 16:34:21 - Loaded .env file\n"
22
+ ]
23
+ }
24
+ ],
25
+ "source": [
26
+ "from datasets import Dataset\n",
27
+ "from app import retrieve_relevant_documents, emb_text, model, embedding_model\n",
28
+ "from langchain.chains.combine_documents import create_stuff_documents_chain\n",
29
+ "from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder\n",
30
+ "from langchain.schema.runnable import RunnableLambda\n",
31
+ "from langchain_core.documents import Document"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "code",
36
+ "execution_count": null,
37
+ "id": "e572fb31",
38
+ "metadata": {},
39
+ "outputs": [],
40
+ "source": [
41
+ "def setup_standalone_rag_chain():\n",
42
+ " \"\"\"Setup a standalone RAG chain for testing without Chainlit session.\"\"\"\n",
43
+ " \n",
44
+ " def get_context_and_history(inputs):\n",
45
+ " \"\"\"Retrieve context without session history.\"\"\"\n",
46
+ " query = inputs[\"question\"]\n",
47
+ " relevant_docs = retrieve_relevant_documents(query, limit=5)\n",
48
+ " print(\"Relevant documents:\", relevant_docs[0] if relevant_docs else \"No documents found\")\n",
49
+ " \n",
50
+ " # Convert dictionaries to Document objects for LangChain\n",
51
+ " doc_objects = []\n",
52
+ " for doc in relevant_docs:\n",
53
+ " doc_obj = Document(\n",
54
+ " page_content=doc.get('text', ''),\n",
55
+ " metadata=doc.get('metadata', {})\n",
56
+ " )\n",
57
+ " doc_objects.append(doc_obj)\n",
58
+ "\n",
59
+ " return {\n",
60
+ " \"question\": query,\n",
61
+ " \"context\": doc_objects,\n",
62
+ " \"history\": [] # Empty history for testing\n",
63
+ " }\n",
64
+ " \n",
65
+ " system_prompt = \"\"\"You are a helpful assistant specialising in developing non-discriminatory competence standards and disability support, reasonable adjustments, and equality legislation.\n",
66
+ "\n",
67
+ "When answering questions, you should:\n",
68
+ "1. Use the provided context documents to inform your response\n",
69
+ "2. Be accurate and helpful\n",
70
+ "3. If the context doesn't contain relevant information, say so clearly\n",
71
+ "4. Always reply in English\n",
72
+ "5. Provide clear recommendations and examples wherever applicable\n",
73
+ "6. Do not make assumptions about the user's knowledge or background\n",
74
+ "7. If the user asks for a specific law or regulation, provide a brief explanation and cite relevant documents if available.\n",
75
+ "8. Do not overemphasize disability in your responses, but rather focus on the support and adjustments that can be made to ensure equality and inclusivity.\n",
76
+ "9. If the user query explicitly asks for a disability-related topic, provide a well-informed response based on the context documents.\n",
77
+ "\n",
78
+ "Context documents:\n",
79
+ "{context} \n",
80
+ "\n",
81
+ "Please provide a clear response using the above context\n",
82
+ "\"\"\"\n",
83
+ "\n",
84
+ " prompt = ChatPromptTemplate.from_messages([\n",
85
+ " (\"system\", system_prompt),\n",
86
+ " MessagesPlaceholder(variable_name=\"history\"),\n",
87
+ " (\"human\", \"{question}\"),\n",
88
+ " ])\n",
89
+ "\n",
90
+ " question_answer_chain = create_stuff_documents_chain(model, prompt)\n",
91
+ " \n",
92
+ " # Use a custom chain that properly handles our context and history\n",
93
+ " def process_input_and_format(inputs):\n",
94
+ " context_data = get_context_and_history(inputs)\n",
95
+ " return {\n",
96
+ " \"context\": context_data[\"context\"],\n",
97
+ " \"question\": context_data[\"question\"],\n",
98
+ " \"history\": context_data[\"history\"]\n",
99
+ " }\n",
100
+ " \n",
101
+ " chain = RunnableLambda(process_input_and_format) | question_answer_chain\n",
102
+ " \n",
103
+ " return chain"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": 3,
109
+ "id": "330ee35d",
110
+ "metadata": {},
111
+ "outputs": [],
112
+ "source": [
113
+ "\n",
114
+ "# Setup the RAG chain\n",
115
+ "rag_chain = setup_standalone_rag_chain()\n",
116
+ "\n",
117
+ "questions = [\"What are Provisions, Criteria and Practices?\", \n",
118
+ " \"What is 'reasonable'?\",\n",
119
+ " \"What is 'substantial disadvantage'?\",\n",
120
+ " ]\n",
121
+ "ground_truths = [\n",
122
+ " \"\"\"The Equality and Human Rights Commission (EHRC) interprets PCPs as including:3 \n",
123
+ "+ arrangements (for example, for deciding who to admit) \n",
124
+ "+ the way that education, or access to any benefit, service or facility is offered or provided \n",
125
+ "+ one-off or discretionary decisions \n",
126
+ "+ proposals or directions to do something in a particular way \n",
127
+ "+ formal and informal policies \n",
128
+ "+ rules\"\"\",\n",
129
+ "\n",
130
+ " \"\"\"There are two key considerations of 'reasonableness' which can help when thinking through \n",
131
+ "when an adjustment may be reasonable:4 \n",
132
+ "+ Could the adjustment be practicable in its application (is it possible)? \n",
133
+ "+ Could the adjustment be effective in achieving its aim (will it work)? \n",
134
+ "There is no need to prove that the adjustment is practicable and effective in advance, just \n",
135
+ "that it might be. An adjustment should not be considered unreasonable if it does not remove \n",
136
+ "the disadvantage fully; an adjustment which partially removes or reduces substantial \n",
137
+ "disadvantage is also likely to be reasonable.\"\"\",\n",
138
+ "\n",
139
+ " \"\"\"'Substantial' is defined in the Act as 'more than minor or trivial'. \n",
140
+ "Examples of disadvantage recognised by the EHRC include: \n",
141
+ "+ The additional time and effort expended by a disabled student \n",
142
+ "+ The inconvenience, indignity, discomfort, or perceived disadvantage suffered by a \n",
143
+ "disabled student \n",
144
+ "+ The loss of opportunity or diminished progress experienced by a disabled student. \"\"\"]\n",
145
+ "\n",
146
+ "\n",
147
+ "\n"
148
+ ]
149
+ },
150
+ {
151
+ "cell_type": "code",
152
+ "execution_count": null,
153
+ "id": "ba3810dd",
154
+ "metadata": {},
155
+ "outputs": [
156
+ {
157
+ "name": "stdout",
158
+ "output_type": "stream",
159
+ "text": [
160
+ "2025-08-16 16:34:53 - HTTP Request: POST https://api.studio.nebius.ai/v1/embeddings \"HTTP/1.1 200 OK\"\n",
161
+ "Relevant documents: {'text': 'What is a provision, criterion or practice? The phrase ‘provision, criterion or practice’ is not defined by the Act. These concepts should be construed widely so as to include, for example, any formal or informal policies, rules, practices, arrangements, criteria, procedures, activities or provisions. They can cover one-off decisions and actions. In simple terms, they are about the way an education provider does things. Example:', 'metadata': {'source': 'data\\\\technical-guidance-further-higher-education.docx', 'file_directory': 'data', 'filename': 'technical-guidance-further-higher-education.docx', 'last_modified': '2025-07-02T21:00:50', 'page_number': 95, 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'category': 'CompositeElement', 'element_id': '3ae881ad6f81487213a9e234debf0921'}, 'score': 0.7780322432518005}\n",
162
+ "2025-08-16 16:34:53 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
163
+ "2025-08-16 16:35:02 - HTTP Request: POST https://api.studio.nebius.ai/v1/embeddings \"HTTP/1.1 200 OK\"\n",
164
+ "2025-08-16 16:35:03 - HTTP Request: POST https://api.studio.nebius.ai/v1/embeddings \"HTTP/1.1 200 OK\"\n",
165
+ "Relevant documents: {'text': '‘Reasonable’ means having regard to all of the circumstances including the nature of the act and how obviously discriminatory it is, the authority of the person making the statement and the knowledge that the helper has or ought to have.', 'metadata': {'source': 'data\\\\technical-guidance-further-higher-education.docx', 'file_directory': 'data', 'filename': 'technical-guidance-further-higher-education.docx', 'last_modified': '2025-07-02T21:00:50', 'page_number': 36, 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'category': 'CompositeElement', 'element_id': 'c5e3a60e2a6ccc88e0eff961f645a962'}, 'score': 0.734176754951477}\n",
166
+ "2025-08-16 16:35:03 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
167
+ "2025-08-16 16:35:11 - HTTP Request: POST https://api.studio.nebius.ai/v1/embeddings \"HTTP/1.1 200 OK\"\n",
168
+ "2025-08-16 16:35:12 - HTTP Request: POST https://api.studio.nebius.ai/v1/embeddings \"HTTP/1.1 200 OK\"\n",
169
+ "Relevant documents: {'text': 'The Act states that disadvantage must be substantial, which is defined as more than minor or trivial. Whether such a disadvantage exists in a particular case is a question of fact, and is assessed on an objective basis. s212(1)', 'metadata': {'source': 'data\\\\technical-guidance-further-higher-education.docx', 'file_directory': 'data', 'filename': 'technical-guidance-further-higher-education.docx', 'last_modified': '2025-07-02T21:00:50', 'page_number': 89, 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'category': 'CompositeElement', 'element_id': 'b9e8ef04daf9150c9f7e32736b53df5b'}, 'score': 0.8380770087242126}\n",
170
+ "2025-08-16 16:35:12 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
171
+ "2025-08-16 16:35:21 - HTTP Request: POST https://api.studio.nebius.ai/v1/embeddings \"HTTP/1.1 200 OK\"\n"
172
+ ]
173
+ }
174
+ ],
175
+ "source": [
176
+ "answers = []\n",
177
+ "contexts = []\n",
178
+ "\n",
179
+ "def clean_answer(answer):\n",
180
+ " \"\"\"Remove <think></think> tags and content from the answer.\"\"\"\n",
181
+ " import re\n",
182
+ " # Remove everything between <think> and </think> tags, including the tags themselves\n",
183
+ " cleaned = re.sub(r'<think>.*?</think>\\s*', '', answer, flags=re.DOTALL)\n",
184
+ " return cleaned.strip()\n",
185
+ "\n",
186
+ "# Inference\n",
187
+ "for query in questions:\n",
188
+ " # Get answer from the RAG chain\n",
189
+ " answer = rag_chain.invoke({\"question\": query})\n",
190
+ " # Clean the answer to remove thinking content\n",
191
+ " cleaned_answer = clean_answer(answer)\n",
192
+ " answers.append(cleaned_answer)\n",
193
+ " \n",
194
+ " # Get relevant documents for context\n",
195
+ " relevant_docs = retrieve_relevant_documents(query, limit=5)\n",
196
+ " context_texts = [doc['text'] for doc in relevant_docs]\n",
197
+ " contexts.append(context_texts)\n",
198
+ "\n",
199
+ "# To dict\n",
200
+ "data = {\n",
201
+ " \"question\": questions,\n",
202
+ " \"answer\": answers,\n",
203
+ " \"contexts\": contexts,\n",
204
+ " \"reference\": ground_truths\n",
205
+ "}\n",
206
+ "\n",
207
+ "# Convert dict to dataset\n",
208
+ "dataset = Dataset.from_dict(data)\n",
209
+ "\n"
210
+ ]
211
+ },
212
+ {
213
+ "cell_type": "code",
214
+ "execution_count": 5,
215
+ "id": "3e016be2",
216
+ "metadata": {},
217
+ "outputs": [
218
+ {
219
+ "name": "stderr",
220
+ "output_type": "stream",
221
+ "text": [
222
+ "Evaluating: 0%| | 0/12 [00:00<?, ?it/s]"
223
+ ]
224
+ },
225
+ {
226
+ "name": "stdout",
227
+ "output_type": "stream",
228
+ "text": [
229
+ "2025-08-16 16:35:22 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
230
+ "2025-08-16 16:35:22 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
231
+ "2025-08-16 16:35:22 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
232
+ "2025-08-16 16:35:22 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
233
+ "2025-08-16 16:35:22 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
234
+ "2025-08-16 16:35:22 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
235
+ "2025-08-16 16:35:22 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
236
+ "2025-08-16 16:35:22 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
237
+ "2025-08-16 16:35:22 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
238
+ "2025-08-16 16:35:22 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
239
+ "2025-08-16 16:35:22 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
240
+ "2025-08-16 16:35:22 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
241
+ "2025-08-16 16:35:22 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
242
+ "2025-08-16 16:35:22 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
243
+ "2025-08-16 16:35:22 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
244
+ "2025-08-16 16:35:22 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
245
+ "2025-08-16 16:35:22 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
246
+ "2025-08-16 16:35:22 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
247
+ "2025-08-16 16:35:26 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
248
+ "2025-08-16 16:35:26 - HTTP Request: POST https://api.studio.nebius.ai/v1/embeddings \"HTTP/1.1 200 OK\"\n",
249
+ "2025-08-16 16:35:27 - HTTP Request: POST https://api.studio.nebius.ai/v1/embeddings \"HTTP/1.1 200 OK\"\n",
250
+ "2025-08-16 16:35:27 - HTTP Request: POST https://api.studio.nebius.ai/v1/embeddings \"HTTP/1.1 200 OK\"\n",
251
+ "2025-08-16 16:35:28 - HTTP Request: POST https://api.studio.nebius.ai/v1/embeddings \"HTTP/1.1 200 OK\"\n"
252
+ ]
253
+ },
254
+ {
255
+ "name": "stderr",
256
+ "output_type": "stream",
257
+ "text": [
258
+ "Evaluating: 8%|▊ | 1/12 [00:06<01:13, 6.67s/it]"
259
+ ]
260
+ },
261
+ {
262
+ "name": "stdout",
263
+ "output_type": "stream",
264
+ "text": [
265
+ "2025-08-16 16:35:29 - HTTP Request: POST https://api.studio.nebius.ai/v1/embeddings \"HTTP/1.1 200 OK\"\n",
266
+ "2025-08-16 16:35:29 - HTTP Request: POST https://api.studio.nebius.ai/v1/embeddings \"HTTP/1.1 200 OK\"\n",
267
+ "2025-08-16 16:35:29 - HTTP Request: POST https://api.studio.nebius.ai/v1/embeddings \"HTTP/1.1 200 OK\"\n",
268
+ "2025-08-16 16:35:29 - HTTP Request: POST https://api.studio.nebius.ai/v1/embeddings \"HTTP/1.1 200 OK\"\n"
269
+ ]
270
+ },
271
+ {
272
+ "name": "stderr",
273
+ "output_type": "stream",
274
+ "text": [
275
+ "Evaluating: 17%|█▋ | 2/12 [00:07<00:32, 3.29s/it]"
276
+ ]
277
+ },
278
+ {
279
+ "name": "stdout",
280
+ "output_type": "stream",
281
+ "text": [
282
+ "2025-08-16 16:35:29 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
283
+ "2025-08-16 16:35:29 - HTTP Request: POST https://api.studio.nebius.ai/v1/embeddings \"HTTP/1.1 200 OK\"\n",
284
+ "2025-08-16 16:35:31 - HTTP Request: POST https://api.studio.nebius.ai/v1/embeddings \"HTTP/1.1 200 OK\"\n",
285
+ "2025-08-16 16:35:31 - HTTP Request: POST https://api.studio.nebius.ai/v1/embeddings \"HTTP/1.1 200 OK\"\n",
286
+ "2025-08-16 16:35:31 - HTTP Request: POST https://api.studio.nebius.ai/v1/embeddings \"HTTP/1.1 200 OK\"\n"
287
+ ]
288
+ },
289
+ {
290
+ "name": "stderr",
291
+ "output_type": "stream",
292
+ "text": [
293
+ "Evaluating: 25%|██▌ | 3/12 [00:09<00:23, 2.61s/it]"
294
+ ]
295
+ },
296
+ {
297
+ "name": "stdout",
298
+ "output_type": "stream",
299
+ "text": [
300
+ "2025-08-16 16:35:31 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
301
+ "2025-08-16 16:35:32 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
302
+ "2025-08-16 16:35:33 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n"
303
+ ]
304
+ },
305
+ {
306
+ "name": "stderr",
307
+ "output_type": "stream",
308
+ "text": [
309
+ "Evaluating: 33%|███▎ | 4/12 [00:11<00:19, 2.46s/it]"
310
+ ]
311
+ },
312
+ {
313
+ "name": "stdout",
314
+ "output_type": "stream",
315
+ "text": [
316
+ "2025-08-16 16:35:34 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n"
317
+ ]
318
+ },
319
+ {
320
+ "name": "stderr",
321
+ "output_type": "stream",
322
+ "text": [
323
+ "Evaluating: 42%|████▏ | 5/12 [00:12<00:13, 1.88s/it]"
324
+ ]
325
+ },
326
+ {
327
+ "name": "stdout",
328
+ "output_type": "stream",
329
+ "text": [
330
+ "2025-08-16 16:35:35 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
331
+ "2025-08-16 16:35:35 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
332
+ "2025-08-16 16:35:35 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
333
+ "2025-08-16 16:35:36 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
334
+ "2025-08-16 16:35:38 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
335
+ "2025-08-16 16:35:39 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
336
+ "2025-08-16 16:35:39 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
337
+ "2025-08-16 16:35:41 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
338
+ "2025-08-16 16:35:43 - HTTP Request: POST https://api.studio.nebius.ai/v1/chat/completions \"HTTP/1.1 200 OK\"\n"
339
+ ]
340
+ },
341
+ {
342
+ "name": "stderr",
343
+ "output_type": "stream",
344
+ "text": [
345
+ "Evaluating: 100%|██████████| 12/12 [00:42<00:00, 3.51s/it]\n"
346
+ ]
347
+ }
348
+ ],
349
+ "source": [
350
+ "from ragas import evaluate\n",
351
+ "from ragas.metrics import (\n",
352
+ " faithfulness,\n",
353
+ " answer_relevancy,\n",
354
+ " context_recall,\n",
355
+ " context_precision,\n",
356
+ ")\n",
357
+ "\n",
358
+ "result = evaluate(\n",
359
+ " llm=model,\n",
360
+ " embeddings=embedding_model,\n",
361
+ " dataset = dataset, \n",
362
+ " metrics=[\n",
363
+ " context_precision,\n",
364
+ " context_recall,\n",
365
+ " faithfulness,\n",
366
+ " answer_relevancy,\n",
367
+ " ],\n",
368
+ ")\n",
369
+ "\n",
370
+ "df = result.to_pandas()\n",
371
+ "\n",
372
+ "# evaluation_results = result.to_pandas()\n",
373
+ "\n",
374
+ "# display_columns = ['user_input', 'answer_relevancy', 'faithfulness', 'context_precision', 'context_recall']\n",
375
+ "# formatted_results = evaluation_results[display_columns].to_markdown(index=False, numalign=\"left\", stralign=\"left\")\n",
376
+ "\n",
377
+ "# print(formatted_results)\n"
378
+ ]
379
+ },
380
+ {
381
+ "cell_type": "code",
382
+ "execution_count": 6,
383
+ "id": "d8514ff3",
384
+ "metadata": {
385
+ "slideshow": {
386
+ "slide_type": "slide"
387
+ }
388
+ },
389
+ "outputs": [
390
+ {
391
+ "name": "stdout",
392
+ "output_type": "stream",
393
+ "text": [
394
+ " user_input \\\n",
395
+ "0 What are Provisions, Criteria and Practices? \n",
396
+ "1 What is 'reasonable'? \n",
397
+ "2 What is 'substantial disadvantage'? \n",
398
+ "\n",
399
+ " retrieved_contexts \\\n",
400
+ "0 [What is a provision, criterion or practice? T... \n",
401
+ "1 [‘Reasonable’ means having regard to all of th... \n",
402
+ "2 [The Act states that disadvantage must be subs... \n",
403
+ "\n",
404
+ " response \\\n",
405
+ "0 **Provisions, Criteria, and Practices (PCPs)**... \n",
406
+ "1 The term **\"reasonable\"** in the context of di... \n",
407
+ "2 **Substantial disadvantage** refers to a situa... \n",
408
+ "\n",
409
+ " reference context_precision \\\n",
410
+ "0 The Equality and Human Rights Commission (EHRC... 0.500000 \n",
411
+ "1 There are two key considerations of 'reasonabl... 0.500000 \n",
412
+ "2 'Substantial' is defined in the Act as 'more t... 0.866667 \n",
413
+ "\n",
414
+ " context_recall faithfulness answer_relevancy \n",
415
+ "0 0.857143 0.666667 0.759951 \n",
416
+ "1 0.500000 0.642857 0.616460 \n",
417
+ "2 0.500000 0.937500 0.767732 \n"
418
+ ]
419
+ }
420
+ ],
421
+ "source": [
422
+ "print(df)"
423
+ ]
424
+ }
425
+ ],
426
+ "metadata": {
427
+ "kernelspec": {
428
+ "display_name": ".venv",
429
+ "language": "python",
430
+ "name": "python3"
431
+ },
432
+ "language_info": {
433
+ "codemirror_mode": {
434
+ "name": "ipython",
435
+ "version": 3
436
+ },
437
+ "file_extension": ".py",
438
+ "mimetype": "text/x-python",
439
+ "name": "python",
440
+ "nbconvert_exporter": "python",
441
+ "pygments_lexer": "ipython3",
442
+ "version": "3.12.5"
443
+ }
444
+ },
445
+ "nbformat": 4,
446
+ "nbformat_minor": 5
447
+ }
test_ragas.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import Dataset
2
+ from app import retrieve_relevant_documents, emb_text, model, embedding_model
3
+ from langchain.chains.combine_documents import create_stuff_documents_chain
4
+ from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
5
+ from langchain.schema.runnable import RunnableLambda
6
+ from langchain_core.documents import Document
7
+
8
+
9
def setup_standalone_rag_chain():
    """Build a RAG chain that can be exercised outside a Chainlit session.

    Context comes from the app's `retrieve_relevant_documents`; the chat
    history slot is always empty so the chain can be invoked statelessly.
    """

    def _prepare_inputs(inputs):
        # Retrieve context for the query; history stays empty for testing.
        question = inputs["question"]
        hits = retrieve_relevant_documents(question, limit=5)
        print("Relevant documents:", hits[0] if hits else "No documents found")

        # The stuff-documents chain expects LangChain Document objects,
        # so wrap each retrieved dict accordingly.
        context_docs = [
            Document(page_content=hit.get('text', ''), metadata=hit.get('metadata', {}))
            for hit in hits
        ]

        return {
            "question": question,
            "context": context_docs,
            "history": [],  # Empty history for testing
        }

    system_prompt = """You are a helpful assistant specialising in developing non-discriminatory competence standards and disability support, reasonable adjustments, and equality legislation.

When answering questions, you should:
1. Use the provided context documents to inform your response
2. Be accurate and helpful
3. If the context doesn't contain relevant information, say so clearly
4. Always reply in English
5. Provide clear recommendations and examples wherever applicable
6. Do not make assumptions about the user's knowledge or background
7. If the user asks for a specific law or regulation, provide a brief explanation and cite relevant documents if available.
8. Do not overemphasize disability in your responses, but rather focus on the support and adjustments that can be made to ensure equality and inclusivity.
9. If the user query explicitly asks for a disability-related topic, provide a well-informed response based on the context documents.

Context documents:
{context}

Please provide a clear response using the above context
"""

    prompt = ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        MessagesPlaceholder(variable_name="history"),
        ("human", "{question}"),
    ])

    # Pipe retrieval straight into the question-answering chain.
    answer_chain = create_stuff_documents_chain(model, prompt)
    return RunnableLambda(_prepare_inputs) | answer_chain
72
+
73
+ # Setup the RAG chain
74
+ rag_chain = setup_standalone_rag_chain()
75
+
76
+ # Evaluation questions; paired index-for-index with `ground_truths` below.
+ questions = ["What is a 'reasonable adjustment'?",
77
+ "To whom do competence standards apply?",
78
+ "Do competence standards vary by subject?",
79
+ ]
80
+ # Reference ("ground truth") answers used by ragas; order must match `questions`.
+ ground_truths = [
81
+ """The reasonable adjustments duty contains three requirements, which relate to changing
82
+ how things are done, changing the built environment to avoid such a substantial
83
+ disadvantage and providing auxiliary aids and services. Specifically:
84
+ 1. A duty to make reasonable adjustments to any provision, criterion or practice (PCP)
85
+ which places disabled students at a substantial disadvantage
86
+ 2. A duty to make reasonable adjustments to physical features
87
+ 3. A duty to provide auxiliary aids (including services) """,
88
+
89
+ """The Act does not specify to whom competence standards may be applied but it is clear that
90
+ anti-discrimination provisions apply to prospective and current students (and in some cases
91
+ former students).
92
+ Providers commonly apply competence standards to:13
93
+ + Applicants, to determine whether they have the knowledge and skills necessary to
94
+ participate in and complete a course of study
95
+ + Students, to determine whether they are ready to progress to the next year/stage of
96
+ study, and to determine whether they have demonstrated the requirements in order to be
97
+ awarded a qualification that necessitates a competence standard to be applied.""",
98
+
99
+ """Competence standards can - and should - vary between courses of study. What may
100
+ constitute a competence standard in one subject area may not be justifiable in another. """]
101
+
102
+
103
+ # Accumulators filled in by the inference loop: one entry per question.
+ answers = []
104
+ contexts = []
105
+
106
def clean_answer(answer):
    """Strip every <think>...</think> reasoning block (and trailing
    whitespace after it) from *answer*, then trim the result."""
    import re
    think_block = re.compile(r'<think>.*?</think>\s*', re.DOTALL)
    return think_block.sub('', answer).strip()
112
+
113
# Inference: answer each evaluation question and capture the retrieved context.
for query in questions:
    # Generate an answer, then strip any <think> reasoning blocks.
    answer = rag_chain.invoke({"question": query})
    answers.append(clean_answer(answer))

    # Record the retrieved passages so ragas can score the retrieval step.
    # NOTE(review): this re-runs retrieval; assumes retrieve_relevant_documents
    # is deterministic so these contexts match the ones the chain saw — confirm.
    relevant_docs = retrieve_relevant_documents(query, limit=5)
    # Use .get for consistency with setup_standalone_rag_chain and to
    # tolerate hits that lack a 'text' field.
    contexts.append([doc.get('text', '') for doc in relevant_docs])

# Assemble the evaluation dataset in the column layout ragas expects.
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "reference": ground_truths,
}
dataset = Dataset.from_dict(data)


# Imported here, after inference, as in the original script layout.
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)

# Score the dataset with the app's LLM and embedding model as judges.
result = evaluate(
    llm=model,
    embeddings=embedding_model,
    dataset=dataset,
    metrics=[
        context_precision,
        context_recall,
        faithfulness,
        answer_relevancy,
    ],
)

# Render a compact per-question score table.
evaluation_results = result.to_pandas()

display_columns = ['user_input', 'answer_relevancy', 'faithfulness', 'context_precision', 'context_recall']
formatted_results = evaluation_results[display_columns].to_markdown(index=False, numalign="left", stralign="left")

print(formatted_results)
test_vector_search.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ from langchain_nebius import NebiusEmbeddings
4
+ from pydantic import SecretStr
5
+ from pymilvus import MilvusClient
6
# Configuration constants
MILVUS_URI = os.getenv("MILVUS_URI", "http://localhost:19530")
COLLECTION_NAME = "my_rag_collection"
DOCUMENT_DIR = "data/"
EMBEDDING_DIMENSION = 4096

milvus_client = MilvusClient(uri=MILVUS_URI)

TEXT_MAX_LENGTH = 65000
CHUNK_SIZE = 100
BATCH_SIZE = 5


# Resolve the API key up front so a missing key fails with a clear message
# instead of SecretStr(None) raising an opaque pydantic validation error.
# Using `or` (rather than getenv's default) also treats empty strings as unset.
_api_key = os.getenv("NEBIUS_API_KEY") or os.getenv("OPENAI_API_KEY")
if not _api_key:
    raise EnvironmentError(
        "Set NEBIUS_API_KEY (or OPENAI_API_KEY) to use the Nebius embedding API."
    )

# Shared embedding model used by generate_embedding() below.
embedding_model = NebiusEmbeddings(
    api_key=SecretStr(_api_key),
    model="Qwen/Qwen3-Embedding-8B",
    base_url="https://api.studio.nebius.ai/v1"
)
+
25
+ def generate_embedding(text):
26
+ """Generate embedding for a single text."""
27
+ return embedding_model.embed_query(text)
28
+
29
+ def verify_insertion():
30
+ """Verify that data was successfully inserted into Milvus."""
31
+ stats = milvus_client.get_collection_stats(COLLECTION_NAME)
32
+ print(f"Collection stats: {stats}")
33
+
34
+ # Test search functionality
35
+ test_query = "What are competence standards and their purpose?"
36
+ test_embedding = generate_embedding(test_query)
37
+
38
+ search_results = milvus_client.search(
39
+ collection_name=COLLECTION_NAME,
40
+ data=[test_embedding],
41
+ limit=3,
42
+ output_fields=["text", "metadata"]
43
+ )
44
+
45
+ print(f"\nTest search results for '{test_query}':")
46
+ for i, result in enumerate(search_results[0]):
47
+ print(f"Result {i+1}:")
48
+ print(f" Score: {result['distance']:.4f}")
49
+ print(f" Text preview: {result['entity']['text'][:300]}...")
50
+ print(f" Metadata: {result['entity']['metadata']}")
51
+ print("-" * 50)
52
+
53
if __name__ == "__main__":
    # Time the whole verification run end to end.
    t0 = time.time()
    print("=" * 60)

    print("\n Starting test search")
    print("=" * 30)
    verify_insertion()

    print(f"\nTotal execution time: {time.time() - t0:.2f} seconds")