vaishnav committed on
Commit
bcf9d83
·
1 Parent(s): c9c38b4

refactor UI and embedding of URLs

Browse files
.claude/settings.local.json CHANGED
@@ -3,7 +3,14 @@
3
  "allow": [
4
  "Bash(ls -la \"D:\\\\MAPS Lab\\\\Others\\\\AIVIZ-BOT\\\\processing\"\" && ls -la \"D:MAPS LabOthersAIVIZ-BOTllm_setup\"\")",
5
  "Bash(git add:*)",
6
- "Bash(git commit:*)"
 
 
 
 
 
 
 
7
  ]
8
  }
9
  }
 
3
  "allow": [
4
  "Bash(ls -la \"D:\\\\MAPS Lab\\\\Others\\\\AIVIZ-BOT\\\\processing\"\" && ls -la \"D:MAPS LabOthersAIVIZ-BOTllm_setup\"\")",
5
  "Bash(git add:*)",
6
+ "Bash(git commit:*)",
7
+ "Bash(python3:*)",
8
+ "Bash(pip3 show:*)",
9
+ "Bash(pip show:*)",
10
+ "WebFetch(domain:www.gradio.app)",
11
+ "WebFetch(domain:github.com)",
12
+ "Bash(.venv/bin/pip install:*)",
13
+ "Bash(python -c:*)"
14
  ]
15
  }
16
  }
app.py CHANGED
@@ -46,13 +46,180 @@ def echo(text, chat_history, request: gr.Request):
46
  def on_reset_button_click():
47
  llm_svc.store=LFUCache(capacity=50)
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  if __name__ == '__main__':
50
  logging.info("Starting AIVIz Bot")
51
 
52
- with gr.Blocks() as demo:
53
- gr.ChatInterface(fn=echo, type="messages")
54
- reset_button = gr.Button("Reset Chat Memory Cache")
55
- reset_button.click(on_reset_button_click)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
- # Launch the interface
58
  demo.launch()
 
46
  def on_reset_button_click():
47
  llm_svc.store=LFUCache(capacity=50)
48
 
49
+ # --- Maritime Theme ---
50
+ maritime_blue = gr.themes.Color(
51
+ c50="#f0f9ff", c100="#e0f2fe", c200="#b9e6fe", c300="#7dd4fc",
52
+ c400="#38bdf8", c500="#0ea5e9", c600="#0284c7", c700="#0369a1",
53
+ c800="#075985", c900="#0c4a6e", c950="#082f49",
54
+ name="maritime-blue",
55
+ )
56
+
57
+ teal_accent = gr.themes.Color(
58
+ c50="#f0fdfa", c100="#ccfbf1", c200="#99f6e4", c300="#5eead4",
59
+ c400="#2dd4bf", c500="#14b8a6", c600="#0d9488", c700="#0f766e",
60
+ c800="#115e59", c900="#134e4a", c950="#042f2e",
61
+ name="teal-accent",
62
+ )
63
+
64
+ try:
65
+ stormy_theme = gr.themes.Ocean(
66
+ primary_hue=maritime_blue,
67
+ secondary_hue=teal_accent,
68
+ neutral_hue="slate",
69
+ spacing_size=gr.themes.sizes.spacing_md,
70
+ radius_size=gr.themes.sizes.radius_lg,
71
+ text_size=gr.themes.sizes.text_md,
72
+ font=(gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"),
73
+ font_mono=(gr.themes.GoogleFont("JetBrains Mono"), "ui-monospace", "Consolas", "monospace"),
74
+ )
75
+ except AttributeError:
76
+ stormy_theme = gr.themes.Soft(
77
+ primary_hue=maritime_blue,
78
+ secondary_hue=teal_accent,
79
+ neutral_hue="slate",
80
+ spacing_size=gr.themes.sizes.spacing_md,
81
+ radius_size=gr.themes.sizes.radius_lg,
82
+ text_size=gr.themes.sizes.text_md,
83
+ font=(gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"),
84
+ font_mono=(gr.themes.GoogleFont("JetBrains Mono"), "ui-monospace", "Consolas", "monospace"),
85
+ )
86
+
87
+ stormy_theme = stormy_theme.set(
88
+ body_background_fill="#f0f9ff",
89
+ body_background_fill_dark="#0c1929",
90
+ body_text_color="#0c4a6e",
91
+ body_text_color_dark="#e0f2fe",
92
+ block_background_fill="#ffffff",
93
+ block_background_fill_dark="#0f2942",
94
+ block_border_color="#b9e6fe",
95
+ block_border_color_dark="#0369a1",
96
+ button_primary_background_fill="linear-gradient(135deg, #0ea5e9, #0d9488)",
97
+ button_primary_background_fill_hover="linear-gradient(135deg, #38bdf8, #14b8a6)",
98
+ button_primary_background_fill_dark="linear-gradient(135deg, #0369a1, #0f766e)",
99
+ button_primary_text_color="#ffffff",
100
+ button_secondary_background_fill="#e0f2fe",
101
+ button_secondary_background_fill_hover="#b9e6fe",
102
+ button_secondary_background_fill_dark="#0f2942",
103
+ button_secondary_text_color="#0c4a6e",
104
+ button_secondary_text_color_dark="#7dd4fc",
105
+ input_background_fill="#f8fafc",
106
+ input_background_fill_dark="#0f2942",
107
+ input_border_color="#b9e6fe",
108
+ input_border_color_focus="#0ea5e9",
109
+ input_border_color_dark="#0369a1",
110
+ shadow_drop="0 2px 8px rgba(14, 165, 233, 0.08)",
111
+ shadow_drop_lg="0 4px 16px rgba(14, 165, 233, 0.12)",
112
+ )
113
+
114
+ custom_css = """
115
+ .stormy-header {
116
+ text-align: center;
117
+ padding: 1.5rem 1rem 1rem 1rem;
118
+ background: linear-gradient(135deg, #0c4a6e 0%, #0ea5e9 50%, #0d9488 100%);
119
+ border-radius: 12px;
120
+ margin-bottom: 0.5rem;
121
+ color: white;
122
+ }
123
+ .stormy-header h1 {
124
+ font-size: 1.8rem;
125
+ margin: 0 0 0.25rem 0;
126
+ font-weight: 700;
127
+ color: #ffffff !important;
128
+ }
129
+ .stormy-header p {
130
+ font-size: 0.95rem;
131
+ margin: 0;
132
+ color: #e0f2fe !important;
133
+ opacity: 0.9;
134
+ }
135
+ .reset-btn {
136
+ max-width: 200px !important;
137
+ }
138
+ .stormy-footer {
139
+ text-align: center;
140
+ font-size: 0.8rem;
141
+ color: #64748b;
142
+ padding-top: 0.5rem;
143
+ }
144
+ """
145
+
146
  if __name__ == '__main__':
147
  logging.info("Starting AIVIz Bot")
148
 
149
+ with gr.Blocks(theme=stormy_theme, css=custom_css, title="Stormy - AISdb Assistant") as demo:
150
+
151
+ # Branding Header
152
+ gr.Markdown(
153
+ """
154
+ <div class="stormy-header">
155
+ <h1>Stormy - AISdb Assistant</h1>
156
+ <p>Your maritime data companion. Ask about AIS vessel tracking, data processing, machine learning, and more.</p>
157
+ </div>
158
+ """,
159
+ elem_id="header",
160
+ )
161
+
162
+ # Chat Interface
163
+ chatbot = gr.Chatbot(
164
+ placeholder=(
165
+ "<strong>Welcome aboard!</strong><br>"
166
+ "I'm Stormy, your AISdb documentation assistant.<br>"
167
+ "Ask me about vessel tracking, data queries, or machine learning with AIS data."
168
+ ),
169
+ height=500,
170
+ type="messages",
171
+ show_copy_button=True,
172
+ )
173
+
174
+ gr.ChatInterface(
175
+ fn=echo,
176
+ type="messages",
177
+ chatbot=chatbot,
178
+ textbox=gr.Textbox(
179
+ placeholder="Ask Stormy about AISdb...",
180
+ container=False,
181
+ scale=7,
182
+ ),
183
+ examples=[
184
+ "How do I get started with AISdb?",
185
+ "How can I query vessel tracks by MMSI?",
186
+ "What machine learning models work with AIS data?",
187
+ "How do I visualize ship trajectories on a map?",
188
+ ],
189
+ )
190
+
191
+ # Action Bar
192
+ with gr.Row():
193
+ with gr.Column(scale=3):
194
+ with gr.Accordion("About Stormy & AISdb", open=False):
195
+ gr.Markdown(
196
+ """
197
+ **Stormy** is an AI assistant built on the AISdb (Automatic Identification System Database)
198
+ documentation. It can help you with:
199
+
200
+ - **Data Access**: Loading AIS data, creating databases, CSV export
201
+ - **Querying**: SQL queries, filtering by MMSI, time ranges, geographic areas
202
+ - **Processing**: Data cleaning, track interpolation, decimation
203
+ - **Visualization**: Plotting vessel trajectories, hexagon discretization
204
+ - **Machine Learning**: Seq2Seq models, autoencoders for AIS data
205
+ - **Geospatial**: Haversine distance, shore distance, bathymetric data
206
+
207
+ Powered by AISdb documentation from [aisviz.gitbook.io](https://aisviz.gitbook.io/documentation)
208
+ and [MAPS Lab](https://mapslab.tech/).
209
+ """
210
+ )
211
+ with gr.Column(scale=1, min_width=200):
212
+ reset_button = gr.Button(
213
+ "Reset Chat Memory",
214
+ variant="secondary",
215
+ size="sm",
216
+ elem_classes=["reset-btn"],
217
+ )
218
+ reset_button.click(on_reset_button_click)
219
+
220
+ # Footer
221
+ gr.Markdown(
222
+ '<div class="stormy-footer">Built with Gradio & LangChain | AISdb Documentation Assistant</div>'
223
+ )
224
 
 
225
  demo.launch()
configs/config.py CHANGED
@@ -68,7 +68,6 @@ URLS = ["https://aisviz.gitbook.io/documentation",
68
  ]
69
  CHUNK_SIZE = 768
70
  CHUNK_OVERLAP = 100
71
- TOTAL_RESULTS = 2389
72
  MAX_SIZE = 100
73
  EMBEDDINGS = HuggingFaceEmbeddings(
74
  model_name="sentence-transformers/all-mpnet-base-v2",
 
68
  ]
69
  CHUNK_SIZE = 768
70
  CHUNK_OVERLAP = 100
 
71
  MAX_SIZE = 100
72
  EMBEDDINGS = HuggingFaceEmbeddings(
73
  model_name="sentence-transformers/all-mpnet-base-v2",
processing/documents.py CHANGED
@@ -31,15 +31,21 @@ def format_documents(docs: list[Document]) -> str:
31
  return "\n\n".join(doc.page_content for doc in docs)
32
 
33
 
34
- def split_documents(documents: Iterable[Document]) -> list[Document]:
35
  """
36
  Splits documents into smaller chunks.
37
 
38
  Args:
39
  documents (Iterable[Document]): The documents to split.
 
 
40
 
41
  Returns:
42
  list[Document]: A list of split documents.
43
  """
44
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
 
 
 
 
45
  return text_splitter.split_documents(documents)
 
31
  return "\n\n".join(doc.page_content for doc in docs)
32
 
33
 
34
+ def split_documents(documents: Iterable[Document], chunk_size: int = 768, chunk_overlap: int = 100) -> list[Document]:
35
  """
36
  Splits documents into smaller chunks.
37
 
38
  Args:
39
  documents (Iterable[Document]): The documents to split.
40
+ chunk_size (int): Maximum size of each chunk in characters.
41
+ chunk_overlap (int): Number of overlapping characters between chunks.
42
 
43
  Returns:
44
  list[Document]: A list of split documents.
45
  """
46
+ text_splitter = RecursiveCharacterTextSplitter(
47
+ chunk_size=chunk_size,
48
+ chunk_overlap=chunk_overlap,
49
+ separators=["\n\n", "\n", ". ", " ", ""],
50
+ )
51
  return text_splitter.split_documents(documents)
processing/texts.py CHANGED
@@ -1,6 +1,12 @@
 
 
 
1
  def clean_text(text: str) -> str:
2
  """
3
- Clean the text by removing unwanted characters and formatting.
4
  """
5
- cleaned_text = text.replace("\n", " ").strip()
6
- return cleaned_text
 
 
 
 
1
+ import re
2
+
3
+
4
  def clean_text(text: str) -> str:
5
  """
6
+ Clean the text by removing unwanted characters while preserving document structure.
7
  """
8
+ text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text) # remove control chars (keep \n \t \r)
9
+ text = re.sub(r'\r\n?', '\n', text) # normalize line endings
10
+ text = re.sub(r'\n{3,}', '\n\n', text) # collapse excessive newlines
11
+ text = re.sub(r'[^\S\n]{3,}', ' ', text) # collapse excessive spaces (not newlines)
12
+ return text.strip()
services/scraper.py CHANGED
@@ -1,6 +1,7 @@
1
  from langchain.schema import Document
2
 
3
- from processing.documents import load_documents, format_documents, split_documents
 
4
  from processing.texts import clean_text
5
 
6
 
@@ -17,10 +18,13 @@ class Service:
17
  for url in urls:
18
  try:
19
  website_documents = load_documents(url)
20
- formatted_content = format_documents(website_documents)
21
- cleaned_content = clean_text(formatted_content)
22
- documents.append(Document(page_content=cleaned_content))
 
23
  except Exception as e:
24
  raise Exception(f"Error processing {url}: {e}")
25
 
26
- self.store.store_embeddings(split_documents(documents))
 
 
 
1
  from langchain.schema import Document
2
 
3
+ import configs.config as config
4
+ from processing.documents import load_documents, split_documents
5
  from processing.texts import clean_text
6
 
7
 
 
18
  for url in urls:
19
  try:
20
  website_documents = load_documents(url)
21
+ for doc in website_documents:
22
+ doc.page_content = clean_text(doc.page_content)
23
+ doc.metadata["source"] = url
24
+ documents.append(doc)
25
  except Exception as e:
26
  raise Exception(f"Error processing {url}: {e}")
27
 
28
+ self.store.store_embeddings(
29
+ split_documents(documents, chunk_size=config.CHUNK_SIZE, chunk_overlap=config.CHUNK_OVERLAP)
30
+ )
stores/chroma.py CHANGED
@@ -15,6 +15,9 @@ class ChromaDB:
15
  def store_embeddings(self, documents: list[Document]):
16
  """
17
  Store embeddings for the documents using HuggingFace embeddings and Chroma vectorstore.
 
18
  """
 
 
19
  self.chroma.add_documents(documents=documents, embeddings=self.embeddings,
20
  persist_directory=self._persistent_directory)
 
15
  def store_embeddings(self, documents: list[Document]):
16
  """
17
  Store embeddings for the documents using HuggingFace embeddings and Chroma vectorstore.
18
+ Skips ingestion if the collection is already populated.
19
  """
20
+ if self.chroma._collection.count() > 0:
21
+ return
22
  self.chroma.add_documents(documents=documents, embeddings=self.embeddings,
23
  persist_directory=self._persistent_directory)