daniel-was-taken commited on
Commit
ace5cd4
·
1 Parent(s): 1db8512

Deployment ready

Browse files
Files changed (9) hide show
  1. .chainlit/config.toml +130 -0
  2. .env.example +17 -0
  3. .gitignore +18 -0
  4. Dockerfile +24 -0
  5. app.py +290 -0
  6. chainlit.md +14 -0
  7. compose.yml +82 -0
  8. populate_db.py +170 -0
  9. requirements.txt +14 -0
.chainlit/config.toml ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ # Whether to enable telemetry (default: false). No personal data is collected.
3
+ enable_telemetry = false
4
+
5
+
6
+ # List of environment variables to be provided by each user to use the app.
7
+ user_env = []
8
+
9
+ # Duration (in seconds) during which the session is saved when the connection is lost
10
+ session_timeout = 3600
11
+
12
+ # Duration (in seconds) of the user session expiry
13
+ user_session_timeout = 1296000 # 15 days
14
+
15
+ # Enable third parties caching (e.g., LangChain cache)
16
+ cache = false
17
+
18
+ # Authorized origins
19
+ allow_origins = ["*"]
20
+
21
+ [features]
22
+ # Process and display HTML in messages. This can be a security risk (see https://stackoverflow.com/questions/19603097/why-is-it-dangerous-to-render-user-generated-html-or-javascript)
23
+ unsafe_allow_html = false
24
+
25
+ # Process and display mathematical expressions. This can clash with "$" characters in messages.
26
+ latex = false
27
+
28
+ # Autoscroll new user messages at the top of the window
29
+ user_message_autoscroll = true
30
+
31
+ # Automatically tag threads with the current chat profile (if a chat profile is used)
32
+ auto_tag_thread = true
33
+
34
+ # Allow users to edit their own messages
35
+ edit_message = true
36
+
37
+ # Authorize users to spontaneously upload files with messages
38
+ [features.spontaneous_file_upload]
39
+ enabled = false
40
+ # Define accepted file types using MIME types
41
+ # Examples:
42
+ # 1. For specific file types:
43
+ # accept = ["image/jpeg", "image/png", "application/pdf"]
44
+ # 2. For all files of certain type:
45
+ # accept = ["image/*", "audio/*", "video/*"]
46
+ # 3. For specific file extensions:
47
+ # accept = { "application/octet-stream" = [".xyz", ".pdb"] }
48
+ # Note: Using "*/*" is not recommended as it may cause browser warnings
49
+ accept = ["*/*"]
50
+ max_files = 20
51
+ max_size_mb = 500
52
+
53
+ [features.audio]
54
+ # Sample rate of the audio
55
+ sample_rate = 24000
56
+
57
+ [features.mcp.sse]
58
+ enabled = true
59
+
60
+ [features.mcp.stdio]
61
+ enabled = true
62
+ # Only the executables in the allow list can be used for MCP stdio server.
63
+ # Only need the base name of the executable, e.g. "npx", not "/usr/bin/npx".
64
+ # Please don't comment this line for now, we need it to parse the executable name.
65
+ allowed_executables = [ "npx", "uvx" ]
66
+
67
+ [UI]
68
+ # Name of the assistant.
69
+ name = "Assistant"
70
+
71
+ # default_theme = "dark"
72
+
73
+ # layout = "wide"
74
+
75
+ # default_sidebar_state = "open"
76
+
77
+ # Description of the assistant. This is used for HTML tags.
78
+ # description = ""
79
+
80
+ # Chain of Thought (CoT) display mode. Can be "hidden", "tool_call" or "full".
81
+ cot = "full"
82
+
83
+ # Specify a CSS file that can be used to customize the user interface.
84
+ # The CSS file can be served from the public directory or via an external link.
85
+ # custom_css = "/public/test.css"
86
+
87
+ # Specify additional attributes for a custom CSS file
88
+ # custom_css_attributes = "media=\"print\""
89
+
90
+ # Specify a JavaScript file that can be used to customize the user interface.
91
+ # The JavaScript file can be served from the public directory.
92
+ # custom_js = "/public/test.js"
93
+
94
+ # The style of alert boxes. Can be "classic" or "modern".
95
+ alert_style = "classic"
96
+
97
+ # Specify additional attributes for custom JS file
98
+ # custom_js_attributes = "async type = \"module\""
99
+
100
+ # Custom login page image, relative to public directory or external URL
101
+ # login_page_image = "/public/custom-background.jpg"
102
+
103
+ # Custom login page image filter (Tailwind internal filters, no dark/light variants)
104
+ # login_page_image_filter = "brightness-50 grayscale"
105
+ # login_page_image_dark_filter = "contrast-200 blur-sm"
106
+
107
+
108
+ # Specify a custom meta image url.
109
+ # custom_meta_image_url = "https://chainlit-cloud.s3.eu-west-3.amazonaws.com/logo/chainlit_banner.png"
110
+
111
+ # Load assistant logo directly from URL.
112
+ logo_file_url = ""
113
+
114
+ # Load assistant avatar image directly from URL.
115
+ default_avatar_file_url = ""
116
+
117
+ # Specify a custom build directory for the frontend.
118
+ # This can be used to customize the frontend code.
119
+ # Be careful: If this is a relative path, it should not start with a slash.
120
+ # custom_build = "./public/build"
121
+
122
+ # Specify optional one or more custom links in the header.
123
+ # [[UI.header_links]]
124
+ # name = "Issues"
125
+ # display_name = "Report Issue"
126
+ # icon_url = "https://avatars.githubusercontent.com/u/128686189?s=200&v=4"
127
+ # url = "https://github.com/Chainlit/chainlit/issues"
128
+
129
+ [meta]
130
+ generated_by = "2.6.0"
.env.example ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ACCESS_TOKEN=''
2
+
3
+ NEBIUS_API_KEY=''
4
+
5
+ PASSWORD=''
6
+
7
+ OPENAI_API_KEY=''
8
+
9
+ CHAINLIT_AUTH_SECRET=""
10
+
11
+ DATABASE_URL=''
12
+
13
+ OAUTH_GOOGLE_CLIENT_ID=''
14
+ OAUTH_GOOGLE_CLIENT_SECRET=''
15
+ OAUTH_URI=""
16
+
17
+ DOCKER_VOLUME_DIRECTORY=/opt/app/volumes
.gitignore CHANGED
@@ -205,3 +205,21 @@ cython_debug/
205
  marimo/_static/
206
  marimo/_lsp/
207
  __marimo__/
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  marimo/_static/
206
  marimo/_lsp/
207
  __marimo__/
208
+
209
+
210
+ # Google Client Secrets
211
+ *.json
212
+
213
+ # Production secrets and environment files
214
+ .env
215
+ .env.local
216
+ .env.production
217
+ .env.staging
218
+
219
+ secrets/
220
+
221
+ # DigitalOcean App Platform
222
+ .do/
223
+
224
+ # Docker volumes (production)
225
+ volumes/
Dockerfile ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use an official Python runtime as a parent image
2
+ FROM python:3.12-slim
3
+
4
+ # Set the working directory in the container
5
+ WORKDIR /app
6
+
7
+ # Copy the requirements file into the container at /app
8
+ COPY requirements.txt .
9
+
10
+ # Install any needed packages specified in requirements.txt
11
+ RUN pip install --no-cache-dir -r requirements.txt
12
+
13
+ # Copy the rest of the application's code into the container at /app
14
+ COPY . .
15
+
16
+ # Make port 8000 available to the world outside this container
17
+ EXPOSE 8000
18
+
19
+ # Define environment variable
20
+ ENV CHAINLIT_HOST=0.0.0.0
21
+ ENV CHAINLIT_PORT=8000
22
+
23
+ # Run app.py when the container launches
24
+ CMD ["chainlit", "run", "app.py"]
app.py ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Dict, List, Optional
3
+ from operator import itemgetter
4
+
5
+ import chainlit as cl
6
+ from chainlit.types import ThreadDict
7
+ from chainlit.data.sql_alchemy import SQLAlchemyDataLayer
8
+ from pydantic import SecretStr
9
+
10
+
11
+ from langchain.chains import create_retrieval_chain
12
+ from langchain.chains.combine_documents import create_stuff_documents_chain
13
+ from langchain_nebius import ChatNebius
14
+ from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
15
+ from langchain.schema.output_parser import StrOutputParser
16
+ from langchain.schema.runnable import RunnablePassthrough, RunnableLambda
17
+ from langchain.schema.runnable.config import RunnableConfig
18
+ from langchain_core.messages import BaseMessage, HumanMessage, AIMessage
19
+
20
+ from pymilvus import MilvusClient
21
+ from sentence_transformers import SentenceTransformer
22
+ from chainlit.input_widget import Select, Switch, Slider
23
+
24
+ from langchain_core.documents import Document
25
+ from typing_extensions import List
26
+
27
+ from populate_db import main
28
+
29
# Initialize Milvus client and embedding model.
# MILVUS_URI defaults to a local standalone instance; docker compose overrides
# it to point at the `standalone` service (see compose.yml).
MILVUS_URI = os.getenv("MILVUS_URI", "http://localhost:19530")
milvus_client = MilvusClient(uri=MILVUS_URI)
collection_name = "my_rag_collection"

# First run: populate the vector store if the collection does not exist yet.
# NOTE(review): populate_db.main() is a long, blocking import-time side effect
# (document parsing + embedding for every file) — confirm this is acceptable
# during app startup.
if not milvus_client.has_collection(collection_name):
    main()

# Embedding model must match the one used by populate_db.py
# (BAAI/bge-small-en-v1.5, 384-dimensional vectors).
embedding_model = SentenceTransformer("BAAI/bge-small-en-v1.5")

# Initialize LLM (Nebius-hosted Qwen3, streaming enabled).
# NOTE(review): the key is read from OPENAI_API_KEY even though this is a
# Nebius client and .env.example also defines NEBIUS_API_KEY — confirm which
# variable is intended. If the variable is unset, os.getenv returns None and
# SecretStr(None) raises at import time.
model = ChatNebius(
    model="Qwen/Qwen3-14B",
    streaming=True,
    temperature=0.7,
    top_p=0.95,
    api_key=SecretStr(os.getenv("OPENAI_API_KEY")),
)
47
+
48
+ # Define application steps
49
+
50
def emb_text(text: str) -> List[float]:
    """Embed a single string and return its normalized vector as a list of floats."""
    vectors = embedding_model.encode([text], normalize_embeddings=True)
    return vectors.tolist()[0]
53
+
54
def retrieve_relevant_documents(query: str, limit: int = 5) -> List[Dict]:
    """Search Milvus for the `limit` chunks most similar to `query`.

    Returns a list of {"text", "metadata", "score"} dicts; on any failure
    the error is logged and an empty list is returned.
    """
    try:
        hits = milvus_client.search(
            collection_name=collection_name,
            data=[emb_text(query)],
            limit=limit,
            output_fields=["text", "metadata"]
        )
        # Flatten the first (and only) query's hits into plain dicts.
        return [
            {
                "text": hit['entity']['text'],
                "metadata": hit['entity']['metadata'],
                "score": hit['distance']
            }
            for hit in hits[0]
        ]
    except Exception as e:
        print(f"Error retrieving documents: {e}")
        return []
78
+
79
def format_docs_with_id(docs: List[Dict]) -> str:
    """Format retrieved documents as numbered citation entries.

    Each entry looks like:
        [n] Source: <filename>, Page: <page>, Score: <score>
        Content: <text>

    Entries are separated by a blank line. Returns "" for an empty list.
    """
    formatted = []
    for i, doc in enumerate(docs):
        # Extract title and page_number from metadata, with fallbacks for
        # documents indexed without full metadata.
        metadata = doc.get('metadata', {})
        title = metadata.get('filename', 'Unknown Document')  # Use filename as fallback for title
        page_number = metadata.get('page_number', 'Unknown')
        score = doc.get('score', 'N/A')  # Use score if available
        text_content = doc.get('text', '')

        formatted_doc = f"[{i+1}] Source: {title}, Page: {page_number}, Score: {score}\nContent: {text_content}"
        formatted.append(formatted_doc)

    # BUG FIX: removed a leftover debug print that dumped every formatted
    # document (full text included) to stdout on each request.
    return "\n\n".join(formatted)
94
+
95
+
96
def setup_rag_chain():
    """Setup the RAG chain with context retrieval.

    Reads the "settings" and "messages" entries from the Chainlit user
    session, so the chain must be rebuilt whenever settings change
    (see setup_agent / on_chat_start).
    """

    def get_context_and_history(inputs):
        """Retrieve context and get conversation history."""
        query = inputs["question"]
        relevant_docs = retrieve_relevant_documents(query, limit=5)
        print("Relevant documents:", relevant_docs[0] if relevant_docs else "No documents found")

        # Convert dictionaries to Document objects for LangChain
        doc_objects = []
        for doc in relevant_docs:
            doc_obj = Document(
                page_content=doc.get('text', ''),
                metadata=doc.get('metadata', {})
            )
            doc_objects.append(doc_obj)

        # Format citations for reference
        citations = format_docs_with_id(relevant_docs)

        # Add citations to the last document's metadata so it's available to the prompt
        if doc_objects:
            doc_objects[-1].metadata['formatted_citations'] = citations

        return {
            "question": query,
            "context": doc_objects,
            "history": cl.user_session.get("messages", [])
        }

    system_prompt = """You are a helpful assistant specialising in disability support, reasonable adjustments, and equality legislation.

When answering questions, you should:
1. Use the provided context documents to inform your response
2. Be accurate and helpful
3. If the context doesn't contain relevant information, say so clearly
4. Always reply in English
5. Provide clear recommendations wherever applicable
6. Do not make assumptions about the user's knowledge or background
7. If the user asks for a specific law or regulation, provide a brief explanation and cite relevant documents if available.
8. Do not overlook the importance of accessibility and inclusivity in your responses.
9. Do not overemphasize disability in your responses, but rather focus on the support and adjustments that can be made to ensure equality and inclusivity.
10. If the user asks about a specific disability, provide general information and resources, but do not make assumptions about the individual's experience or needs.
11. If the user query explicitly asks for a disability-related topic, provide a well-informed response based on the context documents.

Context documents:
{context}

Please provide a clear response using the above context"""

    # Get the current settings to check if Think mode is enabled
    settings = cl.user_session.get("settings", {})
    use_think = settings.get("Think", True)  # Default to True as per the initial setting

    # '/no_think' is prepended when Think mode is off — presumably a Qwen
    # directive that disables the model's thinking phase (TODO confirm
    # against the Qwen3 prompt documentation).
    if not use_think:
        system_prompt = '/no_think ' + system_prompt

    prompt = ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        MessagesPlaceholder(variable_name="history"),
        ("human", "{question}"),
    ])

    # create_stuff_documents_chain "stuffs" all retrieved Documents into the
    # {context} slot of the prompt, then pipes the result through the model.
    question_answer_chain = create_stuff_documents_chain(model, prompt)

    # Use a custom chain that properly handles our context and history
    def process_input_and_format(inputs):
        context_data = get_context_and_history(inputs)
        return {
            "context": context_data["context"],
            "question": context_data["question"],
            "history": context_data["history"]
        }

    chain = RunnableLambda(process_input_and_format) | question_answer_chain

    return chain
173
+
174
+
175
+ # ============== Application Setup ==============
176
+
177
+
178
+ # Authentication
179
+ @cl.password_auth_callback
180
+ def auth(username: str, password: str) -> Optional[cl.User]:
181
+ if (username, password) == ("admin", SecretStr(os.getenv("PASSWORD"))):
182
+ return cl.User(
183
+ identifier="admin",
184
+ metadata={"role": "admin", "provider": "credentials"},
185
+ id="admin_id"
186
+ )
187
+ return None
188
+
189
@cl.oauth_callback
def oauth_callback(
    provider_id: str,
    token: str,
    raw_user_data: Dict[str, str],
    default_user: cl.User,
) -> Optional[cl.PersistedUser]:
    """Accept every successful OAuth login as-is.

    No extra authorization checks are performed; the user object built by
    Chainlit from the provider's data is returned unchanged.
    """
    return default_user
197
+
198
# Starters
@cl.set_starters
async def set_starters():
    """Suggested conversation starters shown on the welcome screen."""
    prompts = [
        ("Considerations for Autistic People",
         "What considerations should be made for autistic people?"),
        ("Explain Equality Act 2010",
         "Explain the Equality Act 2010 in simple terms."),
    ]
    return [cl.Starter(label=label, message=message) for label, message in prompts]
211
+
212
+
213
# Chat lifecycle
@cl.on_chat_start
async def on_chat_start():
    """Initialize a chat session: settings widget, RAG chain, empty history.

    FIX: the original kept its docstring as a dead string literal in the
    middle of the function and bound the ChatSettings result to an unused
    `settings` variable; both are cleaned up here (behavior unchanged).
    """
    # Render the settings panel; the session default below must match the
    # Switch's `initial` value.
    await cl.ChatSettings(
        [
            Switch(id="Think", label="Use Deep Thinking", initial=True),
        ]
    ).send()

    # Store initial settings
    cl.user_session.set("settings", {"Think": True})

    chain = setup_rag_chain()
    cl.user_session.set("chain", chain)
    cl.user_session.set("messages", [])
229
+
230
+
231
@cl.on_settings_update
async def setup_agent(settings):
    """Persist updated chat settings and rebuild the RAG chain with them."""
    print("on_settings_update", settings)
    # Persist first so setup_rag_chain() reads the new values from the session.
    cl.user_session.set("settings", settings)
    cl.user_session.set("chain", setup_rag_chain())
240
+
241
+
242
@cl.on_chat_resume
async def on_chat_resume(thread: ThreadDict):
    """Rebuild conversation history and the RAG chain for a resumed thread."""
    history = []
    for step in thread["steps"]:
        # Only top-level steps (no parent) are part of the transcript.
        if step["parentId"] is not None:
            continue
        if step["type"] == "user_message":
            history.append(HumanMessage(content=step["output"]))
        else:
            history.append(AIMessage(content=step["output"]))

    cl.user_session.set("messages", history)

    cl.user_session.set("chain", setup_rag_chain())
259
+
260
@cl.on_message
async def on_message(message: cl.Message):
    """Handle incoming messages with RAG and conversation history."""
    chain = cl.user_session.get("chain")
    messages = cl.user_session.get("messages", [])
    # Stream the final answer; tokens after "</think> " are treated as the
    # answer — apparently the model emits reasoning inside <think> tags
    # (TODO confirm against Qwen3 output format).
    cb = cl.AsyncLangchainCallbackHandler(
        stream_final_answer=True, answer_prefix_tokens=["</think> "]
    )

    try:
        # Get the relevant documents for citations.
        # NOTE(review): this duplicates the retrieval done inside the chain
        # (setup_rag_chain -> get_context_and_history), so every message
        # triggers two identical Milvus searches — consider sharing one.
        relevant_docs = retrieve_relevant_documents(message.content, limit=5)
        citations = format_docs_with_id(relevant_docs)

        answer = await chain.ainvoke({"question": message.content}, config=RunnableConfig(callbacks=[cb]))

        # Show the citations in a collapsible "References" step.
        async with cl.Step(name="References") as step:
            if relevant_docs:
                step.output = citations
            else:
                step.output = "No relevant documents found for this query."

        # Update conversation history
        messages.append(HumanMessage(content=message.content))
        messages.append(AIMessage(content=answer))

        cl.user_session.set("messages", messages)

    except Exception as e:
        await cl.Message(content=f"Sorry, I encountered an error: {str(e)}").send()
chainlit.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Welcome to Chainlit! 🚀🤖
2
+
3
+ Hi there, Developer! 👋 We're excited to have you on board. Chainlit is a powerful tool designed to help you prototype, debug and share applications built on top of LLMs.
4
+
5
+ ## Useful Links 🔗
6
+
7
+ - **Documentation:** Get started with our comprehensive [Chainlit Documentation](https://docs.chainlit.io) 📚
8
+ - **Discord Community:** Join our friendly [Chainlit Discord](https://discord.gg/k73SQ3FyUh) to ask questions, share your projects, and connect with other developers! 💬
9
+
10
+ We can't wait to see what you create with Chainlit! Happy coding! 💻😊
11
+
12
+ ## Welcome screen
13
+
14
+ To modify the welcome screen, edit the `chainlit.md` file at the root of your project. If you do not want a welcome screen, just leave this file empty.
compose.yml ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: '3.5'
2
+
3
+ services:
4
+ etcd:
5
+ container_name: milvus-etcd
6
+ image: quay.io/coreos/etcd:v3.5.18
7
+ environment:
8
+ - ETCD_AUTO_COMPACTION_MODE=revision
9
+ - ETCD_AUTO_COMPACTION_RETENTION=1000
10
+ - ETCD_QUOTA_BACKEND_BYTES=4294967296
11
+ - ETCD_SNAPSHOT_COUNT=50000
12
+ volumes:
13
+ - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd
14
+ command: etcd -advertise-client-urls=http://etcd:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd
15
+ healthcheck:
16
+ test: ["CMD", "etcdctl", "endpoint", "health"]
17
+ interval: 30s
18
+ timeout: 20s
19
+ retries: 3
20
+
21
+ minio:
22
+ container_name: milvus-minio
23
+ image: minio/minio:RELEASE.2024-05-28T17-19-04Z
24
+ environment:
25
+ MINIO_ACCESS_KEY: minioadmin
26
+ MINIO_SECRET_KEY: minioadmin
27
+ ports:
28
+ - "9001:9001"
29
+ - "9000:9000"
30
+ volumes:
31
+ - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data
32
+ command: minio server /minio_data --console-address ":9001"
33
+ healthcheck:
34
+ test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
35
+ interval: 30s
36
+ timeout: 20s
37
+ retries: 3
38
+
39
+ standalone:
40
+ container_name: milvus-standalone
41
+ image: milvusdb/milvus:v2.5.14
42
+ command: ["milvus", "run", "standalone"]
43
+ security_opt:
44
+ - seccomp:unconfined
45
+ environment:
46
+ MINIO_REGION: us-east-1
47
+ ETCD_ENDPOINTS: etcd:2379
48
+ MINIO_ADDRESS: minio:9000
49
+ volumes:
50
+ - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus
51
+ healthcheck:
52
+ test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"]
53
+ interval: 30s
54
+ start_period: 90s
55
+ timeout: 20s
56
+ retries: 3
57
+ ports:
58
+ - "19530:19530"
59
+ - "9091:9091"
60
+ depends_on:
61
+ etcd:
62
+ condition: service_healthy
63
+ minio:
64
+ condition: service_healthy
65
+
66
+ app:
67
+ container_name: rag-app
68
+ image: rag-app
69
+ build:
70
+ context: .
71
+ dockerfile: Dockerfile
72
+ ports:
73
+ - "8000:8000"
74
+ environment:
75
+ - MILVUS_URI=http://standalone:19530
76
+ - OPENAI_API_KEY=${OPENAI_API_KEY}
77
+ depends_on:
78
+ - standalone
79
+
80
+ networks:
81
+ default:
82
+ name: milvus
populate_db.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ from pathlib import Path
3
+ from unstructured.cleaners.core import clean_extra_whitespace, replace_unicode_quotes, clean_dashes, group_broken_paragraphs
4
+ from langchain_unstructured import UnstructuredLoader
5
+ from sentence_transformers import SentenceTransformer
6
+ from pymilvus import MilvusClient, DataType
7
+
8
import os

# Initialize Milvus client and collection setup.
# FIX: the URI was hardcoded to localhost, but app.py imports main() from
# this module and connects via the MILVUS_URI env var (set to
# http://standalone:19530 in compose.yml) — the hardcoded value would make
# the import-time population fail inside the container. Use the same
# env-var-with-localhost-default scheme as app.py.
MILVUS_URI = os.getenv("MILVUS_URI", "http://localhost:19530")
milvus_client = MilvusClient(uri=MILVUS_URI)
collection_name = "my_rag_collection"

# Drop existing collection if it exists, so re-runs start from a clean slate.
if milvus_client.has_collection(collection_name):
    milvus_client.drop_collection(collection_name)

# Initialize embedding model (must match the one used by app.py).
embedding_model = SentenceTransformer("BAAI/bge-small-en-v1.5")
18
+
19
def emb_text(text):
    """Return the normalized embedding of `text` as a plain list of floats."""
    encoded = embedding_model.encode([text], normalize_embeddings=True)
    return encoded.tolist()[0]
22
+
23
# Create Milvus collection schema: explicit integer primary key, the
# embedding vector, the raw chunk text, and free-form JSON metadata.
schema = milvus_client.create_schema(auto_id=False, enable_dynamic_field=False)
schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=384)  # BGE-small-en-v1.5 dimension
schema.add_field(field_name="text", datatype=DataType.VARCHAR, max_length=32768)  # max_length is in characters
schema.add_field(field_name="metadata", datatype=DataType.JSON)

# Create index for vector search. COSINE matches the normalized embeddings
# produced by emb_text; AUTOINDEX lets Milvus pick the index implementation.
index_params = MilvusClient.prepare_index_params()
index_params.add_index(
    field_name="vector",
    metric_type="COSINE",
    index_type="AUTOINDEX",
)

# Create and load collection ("Strong" consistency: reads see prior writes).
milvus_client.create_collection(
    collection_name=collection_name,
    schema=schema,
    index_params=index_params,
    consistency_level="Strong",
)
milvus_client.load_collection(collection_name=collection_name)

# Directory scanned by unstructured_document_loader() for source documents.
directory_path = "data/"
49
+
50
def main():
    """Main function to load documents and insert them into Milvus.

    Loads and chunks everything under `directory_path`, embeds each chunk,
    inserts the rows into the collection, and returns the loaded documents.
    """
    docs = unstructured_document_loader()

    # Prepare data for insertion
    data_to_insert = []

    print(f"Processing {len(docs)} documents for insertion...")

    for i, doc in enumerate(docs):
        # Check text length and truncate if necessary — the VARCHAR field
        # was created with max_length=32768 above.
        text_content = doc.page_content
        if len(text_content) > 32000:  # Leave some buffer below 32KB limit
            text_content = text_content[:32000]
            print(f"Document {i+1} truncated from {len(doc.page_content)} to {len(text_content)} characters")

        # Generate embedding for the document content
        embedding = emb_text(text_content)

        # Prepare the data entry; `id` is the chunk's position in `docs`
        # since the collection was created with auto_id=False.
        data_entry = {
            "id": i,
            "vector": embedding,
            "text": text_content,
            "metadata": doc.metadata if doc.metadata else {}
        }

        data_to_insert.append(data_entry)

        # Print progress every 100 documents
        if (i + 1) % 100 == 0:
            print(f"Processed {i + 1}/{len(docs)} documents")

    print(f"Inserting {len(data_to_insert)} documents into Milvus...")

    # Insert data into Milvus
    insert_result = milvus_client.insert(
        collection_name=collection_name,
        data=data_to_insert
    )

    print(f"Successfully inserted {insert_result['insert_count']} documents")
    print(f"Primary keys: {insert_result['ids'][:10]}...")  # Show first 10 IDs

    return docs
+
96
def unstructured_document_loader():
    """Load and chunk PDF/DOCX/HTML files from `directory_path`.

    Uses UnstructuredLoader with title-based chunking plus text-cleaning
    post-processors. Returns a list of LangChain documents — possibly
    empty when the data directory contains no matching files.
    """
    # Collect file paths for PDF, DOCX, and HTML files
    file_extensions = ["*.pdf", "*.docx", "*.html"]
    file_paths = []

    for ext in file_extensions:
        file_paths.extend(Path(directory_path).glob(ext))

    # Convert Path objects to strings (UnstructuredLoader expects str paths)
    file_paths = [str(file) for file in file_paths]

    # Configure UnstructuredLoader with post-processors
    loader = UnstructuredLoader(
        file_paths,
        chunking_strategy="by_title",
        include_orig_elements=False,
        post_processors=[
            clean_extra_whitespace,
            replace_unicode_quotes,
            clean_dashes,
            group_broken_paragraphs
        ]
    )

    docs = loader.load()
    print(f"Number of LangChain documents: {len(docs)}")
    # BUG FIX: the original indexed docs[0] unconditionally, raising
    # IndexError when the data directory was empty.
    if docs:
        print(f"Length of first document: {len(docs[0].page_content)} characters")
        print(f"First document preview: {docs[0].page_content[:200]}...")

    return docs
127
+
128
def verify_insertion():
    """Verify that data was successfully inserted into Milvus.

    Prints the collection statistics, then runs one sample vector search
    and prints the top-3 hits (score, text preview, metadata).
    """
    # Get collection statistics
    stats = milvus_client.get_collection_stats(collection_name)
    print(f"Collection stats: {stats}")

    # Test search functionality with a sample query
    test_query = "Questions by staff to other staff"
    test_embedding = emb_text(test_query)

    search_results = milvus_client.search(
        collection_name=collection_name,
        data=[test_embedding],
        limit=3,
        output_fields=["text", "metadata"]
    )

    print(f"\nTest search results for '{test_query}':")
    for i, result in enumerate(search_results[0]):
        print(f"Result {i+1}:")
        print(f"  Score: {result['distance']:.4f}")
        print(f"  Text preview: {result['entity']['text'][:200]}...")
        print(f"  Metadata: {result['entity']['metadata']}")
        print("-" * 50)
152
+
153
+
154
if __name__ == "__main__":
    # Time the whole ingestion + verification run.
    start_time = time.time()

    banner = "=" * 60
    print(banner)
    print("STARTING DOCUMENT PROCESSING AND MILVUS INSERTION")
    print(banner)

    main()

    print("\n" + "=" * 50)
    print("VERIFYING DATA INSERTION")
    print("=" * 50)
    verify_insertion()

    elapsed_time = time.time() - start_time
    print(f"\nTotal execution time: {elapsed_time:.2f} seconds")
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ chainlit>=1.0.0
2
+ openai>=1.0.0
3
+ huggingface_hub>=0.20.0
4
+ pymilvus>=2.3.0
5
+ sentence-transformers>=2.2.0
6
+ unstructured[all-docs]>=0.10.0
7
+ langchain-unstructured>=0.1.0
8
+ langchain>=0.1.0
9
+ langchain-core>=0.1.0
10
+ python-dotenv>=1.0.0
11
+ pydantic>=2.0.0
12
+ fastapi>=0.100.0
13
+ uvicorn>=0.20.0
14
+ langchain_nebius>=0.1.0