mbudisic committed
Commit 9063e00 · 1 Parent(s): 5f0c5d1

Moving RAG to app
.vscode/settings.json ADDED
@@ -0,0 +1,3 @@
+{
+    "python.pythonPath": "/home/mbudisic/Documents/PsTuts-RAG/.venv/bin/python"
+}
app.py CHANGED
@@ -1,11 +1,69 @@
+from typing import List
 import chainlit as cl
+import json
+
+from langchain_experimental.text_splitter import SemanticChunker
+from langchain_openai.embeddings import OpenAIEmbeddings
+from langchain_core.documents import Document
+
+from langchain_qdrant import QdrantVectorStore
+from qdrant_client import QdrantClient
+from qdrant_client.http.models import Distance, VectorParams
+from dataclasses import dataclass
+
+import pstuts_rag.datastore
+
+
+@dataclass
+class ApplicationParameters:
+    # Type annotations are required for @dataclass to register these as fields.
+    filename: str = "data/test.json"
+    embedding_model: str = "text-embedding-3-small"
+
+
+class ApplicationState:
+    embeddings: OpenAIEmbeddings = None
+    docs: List[Document] = []
+    qdrantclient: QdrantClient = None
+    vectorstore: QdrantVectorStore = None
+    n_context_docs = 2
+    retriever = None
+
+
+state = ApplicationState()
+
+
+@cl.on_chat_start
+async def on_chat_start():
+    params = ApplicationParameters()
+
+    await cl.Message(content=f"Loading file {params.filename}").send()
+    data = json.load(open(params.filename, "rb"))
+
+    state.embeddings = OpenAIEmbeddings(model=params.embedding_model)
+    state.docs = pstuts_rag.datastore.transcripts_load(data, state.embeddings)
+    await cl.Message(
+        content=f"Loaded {len(state.docs)} chunks from file {params.filename}."
+    ).send()
+
+    state.qdrantclient = QdrantClient(":memory:")
+
+    state.vectorstore = pstuts_rag.datastore.initialize_vectorstore(
+        client=state.qdrantclient,
+        collection_name=f"{params.filename}_qdrant",
+        embeddings=state.embeddings,
+    )
+
+    _ = state.vectorstore.add_documents(documents=state.docs)
+    state.retriever = state.vectorstore.as_retriever(
+        search_kwargs={"k": state.n_context_docs}
+    )
+

 @cl.on_message
 async def main(message: cl.Message):
     # Send a response back to the user
-    await cl.Message(
-        content=f"Hello! You said: {message.content}"
-    ).send()
+
+    await cl.Message(content=f"Hello! You said: {message.content}").send()


 if __name__ == "__main__":
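After this commit, `on_chat_start` builds the retriever but `main` still only echoes the user's message; the retriever is not yet wired into the chat handler. A minimal sketch of how it could be used there (hypothetical code, not part of this commit):

```python
# Hypothetical replacement for main(); assumes state.retriever was
# populated by on_chat_start as in the diff above.
@cl.on_message
async def main(message: cl.Message):
    # Retrieve the k closest transcript chunks for the user's query.
    docs = await state.retriever.ainvoke(message.content)
    context = "\n\n".join(d.page_content for d in docs)
    await cl.Message(content=f"Closest transcript passages:\n\n{context}").send()
```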
notebooks/transcript_rag.ipynb ADDED
@@ -0,0 +1,361 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from getpass import getpass\n",
+    "\n",
+    "from dotenv import load_dotenv\n",
+    "from pprint import pp\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pstuts_rag"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "load_dotenv()\n",
+    "\n",
+    "def set_api_key_if_not_present(key_name, prompt_message=\"\"):\n",
+    "    if len(prompt_message) == 0:\n",
+    "        prompt_message = key_name\n",
+    "    if key_name not in os.environ or not os.environ[key_name]:\n",
+    "        os.environ[key_name] = getpass(prompt_message)\n",
+    "\n",
+    "set_api_key_if_not_present(\"OPENAI_API_KEY\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Data Preparation\n",
+    "\n",
+    "First, we will read in the transcripts of the videos and convert them to Documents\n",
+    "with appropriate metadata."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "filename = \"../data/test.json\"\n",
+    "\n",
+    "data = json.load(open(filename, \"rb\"))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_experimental.text_splitter import SemanticChunker\n",
+    "from langchain_openai.embeddings import OpenAIEmbeddings\n",
+    "from pstuts_rag.datastore import transcripts_load\n",
+    "\n",
+    "embeddings = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n",
+    "docs_chunks_semantic = transcripts_load(data, embeddings)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## R - Retrieval"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Next, we index the semantic chunks in an in-memory Qdrant vector store."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_qdrant import QdrantVectorStore\n",
+    "from qdrant_client import QdrantClient\n",
+    "from qdrant_client.http.models import Distance, VectorParams\n",
+    "\n",
+    "client = QdrantClient(\":memory:\")\n",
+    "\n",
+    "collection_name = f\"{filename}_qdrant\"\n",
+    "\n",
+    "client.create_collection(\n",
+    "    collection_name=collection_name,\n",
+    "    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),\n",
+    ")\n",
+    "\n",
+    "vector_store = QdrantVectorStore(\n",
+    "    client=client,\n",
+    "    collection_name=collection_name,\n",
+    "    embedding=embeddings,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "_ = vector_store.add_documents(documents=docs_chunks_semantic)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "retriever = vector_store.as_retriever(search_kwargs={\"k\": 2})\n",
+    "\n",
+    "def retrieve(state):\n",
+    "    retrieved_docs = retriever.invoke(state[\"question\"])\n",
+    "    return {\"context\": retrieved_docs}\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "a = retrieve({\"question\": \"What is a layer?\"})\n",
+    "[pp(d.page_content) for d in a[\"context\"]]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## A - Augmentation\n",
+    "\n",
+    "We need to populate a prompt for the LLM.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.prompts import ChatPromptTemplate\n",
+    "\n",
+    "SYSTEM_PROMPT = \"\"\"\\\n",
+    "You are a helpful expert on Photoshop and your goal is to help users\n",
+    "gain knowledge from a database of training videos.\n",
+    "You answer questions based on provided context.\n",
+    "Your answers use emojis for emphasis.\n",
+    "\n",
+    "IMPORTANT: You must only use the provided context, and cannot use your own knowledge.\n",
+    "If there is no context that corresponds to the query, respond by saying\n",
+    "\"I don't know. This is not available in our training library.\"\n",
+    "\n",
+    "Most of the user's questions will be in the form:\n",
+    "\"How can I do ...\"\n",
+    "or\n",
+    "\"What is ...\"\n",
+    "\n",
+    "When appropriate, provide your answers in a step-by-step form.\n",
+    "ALWAYS list the URL and the title of the reference video.\n",
+    "NEVER invent the explanation. ALWAYS use ONLY the context information.\n",
+    "\n",
+    "\"\"\"\n",
+    "\n",
+    "RAG_PROMPT = \"\"\"\\\n",
+    "\n",
+    "### Question\n",
+    "{question}\n",
+    "\n",
+    "NEVER invent the explanation. ALWAYS use ONLY the context information.\n",
+    "\n",
+    "### Context\n",
+    "{context}\n",
+    "\n",
+    "\n",
+    "\"\"\"\n",
+    "\n",
+    "rag_prompt = ChatPromptTemplate(\n",
+    "    [(\"system\", SYSTEM_PROMPT),\n",
+    "     (\"human\", RAG_PROMPT),\n",
+    "    ]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## G - Generation\n",
+    "\n",
+    "We will use GPT-4.1-nano to generate answers."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_openai import ChatOpenAI\n",
+    "\n",
+    "llm = ChatOpenAI(model=\"gpt-4.1-nano\", temperature=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def generate(state):\n",
+    "    docs_content = \"\\n\\n\".join(doc.page_content for doc in state[\"context\"])\n",
+    "\n",
+    "    references = [\n",
+    "        {k: doc.metadata[k] for k in (\"title\", \"source\", \"start\", \"stop\")}\n",
+    "        for doc in state[\"context\"]\n",
+    "    ]\n",
+    "\n",
+    "\n",
+    "    messages = rag_prompt.format_messages(question=state[\"question\"],\n",
+    "                                          context=docs_content)\n",
+    "    response = llm.invoke(messages)\n",
+    "    retval = {\"response\": f\"{response.content}\\n\\n**References**:\\n{json.dumps(references, indent=2)}\",\n",
+    "              \"context\": state[\"context\"]}\n",
+    "\n",
+    "    return retval\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langgraph.graph import START, StateGraph\n",
+    "from typing_extensions import List, TypedDict, Annotated\n",
+    "from langchain_core.documents import Document\n",
+    "from langchain_core.messages import AIMessage, BaseMessage, HumanMessage\n",
+    "from langchain_openai.chat_models import ChatOpenAI\n",
+    "import operator\n",
+    "\n",
+    "class State(TypedDict):\n",
+    "    question: str\n",
+    "    context: List[Document]\n",
+    "    response: str\n",
+    "\n",
+    "graph_builder = StateGraph(State).add_sequence([retrieve, generate])\n",
+    "graph_builder.add_edge(START, \"retrieve\")\n",
+    "graph = graph_builder.compile()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 53,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.schema.output_parser import StrOutputParser\n",
+    "response = graph.invoke({\"question\": \"What is a layer in Photoshop?\"})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response.keys()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "type(response)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pp(response)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response.keys()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
pstuts_rag/pstuts_rag.egg-info/PKG-INFO ADDED
@@ -0,0 +1,5 @@
+Metadata-Version: 2.4
+Name: pstuts_rag
+Version: 0.1
+Summary: PsTuts rag system
+Dynamic: summary
pstuts_rag/pstuts_rag.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,8 @@
+setup.py
+pstuts_rag/__init__.py
+pstuts_rag/loader.py
+pstuts_rag.egg-info/PKG-INFO
+pstuts_rag.egg-info/SOURCES.txt
+pstuts_rag.egg-info/dependency_links.txt
+pstuts_rag.egg-info/not-zip-safe
+pstuts_rag.egg-info/top_level.txt
pstuts_rag/pstuts_rag.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
+
pstuts_rag/pstuts_rag.egg-info/not-zip-safe ADDED
@@ -0,0 +1 @@
+
pstuts_rag/pstuts_rag.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
+pstuts_rag
pstuts_rag/pstuts_rag/__init__.py ADDED
File without changes
pstuts_rag/pstuts_rag/datastore.py ADDED
@@ -0,0 +1,89 @@
+from typing import List, Dict, Iterator
+import functools
+
+
+from langchain_experimental.text_splitter import SemanticChunker
+from langchain_openai.embeddings import OpenAIEmbeddings
+from langchain_core.documents import Document
+
+from pstuts_rag.loader import VideoTranscriptBulkLoader, VideoTranscriptLoader
+
+from langchain_qdrant import QdrantVectorStore
+from qdrant_client import QdrantClient
+from qdrant_client.http.models import Distance, VectorParams
+
+
+def transcripts_load(
+    json_transcripts: List[Dict],
+    embeddings: OpenAIEmbeddings = OpenAIEmbeddings(model="text-embedding-3-small"),
+) -> List[Document]:
+    """
+    Load and process video transcripts into semantically chunked documents.
+
+    This function takes a list of transcript dictionaries, loads them as both full
+    transcripts and individual chunks, then applies semantic chunking. It also
+    enriches each semantic chunk with timestamp metadata from the original verbatim chunks.
+
+    Args:
+        json_transcripts: List of dictionaries containing video transcript data
+        embeddings: OpenAI embeddings model to use for semantic chunking
+
+    Returns:
+        List of semantically chunked Document objects with enhanced metadata
+    """
+
+    docs_full_transcript = VideoTranscriptBulkLoader(json_transcripts).load()
+    docs_chunks_verbatim = VideoTranscriptLoader(json_transcripts).load()
+
+    text_splitter = SemanticChunker(embeddings)
+
+    docs_chunks_semantic: List[Document] = text_splitter.split_documents(
+        docs_full_transcript
+    )
+
+    def is_subchunk(a: Document, ofb: Document) -> bool:
+        return (a.metadata["video_id"] == ofb.metadata["video_id"]) and (
+            a.page_content in ofb.page_content
+        )
+
+    # Create a lookup dictionary for faster access
+    video_id_to_chunks = {}
+    for chunk in docs_chunks_verbatim:
+        video_id = chunk.metadata["video_id"]
+        if video_id not in video_id_to_chunks:
+            video_id_to_chunks[video_id] = []
+        video_id_to_chunks[video_id].append(chunk)
+
+    for chunk in docs_chunks_semantic:
+        video_id = chunk.metadata["video_id"]
+        # Only check chunks from the same video
+        potential_subchunks = video_id_to_chunks.get(video_id, [])
+        subchunks = [
+            c for c in potential_subchunks if c.page_content in chunk.page_content
+        ]
+
+        times = [(t.metadata["time_start"], t.metadata["time_end"]) for t in subchunks]
+        chunk.metadata["speech_start_stop_times"] = times
+
+        if times:  # Avoid IndexError if times is empty
+            chunk.metadata["start"], chunk.metadata["stop"] = times[0][0], times[-1][-1]
+        else:
+            chunk.metadata["start"], chunk.metadata["stop"] = None, None
+
+    return docs_chunks_semantic
+
+
+def initialize_vectorstore(
+    client: QdrantClient, collection_name: str, embeddings: OpenAIEmbeddings
+) -> QdrantVectorStore:
+    client.create_collection(
+        collection_name=collection_name,
+        vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
+    )
+
+    vector_store = QdrantVectorStore(
+        client=client,
+        collection_name=collection_name,
+        embedding=embeddings,
+    )
+    return vector_store
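Together, `transcripts_load` and `initialize_vectorstore` form the ingestion pipeline that `app.py` now calls. A minimal end-to-end sketch, assuming `OPENAI_API_KEY` is set and a transcript file exists at `data/test.json`:

```python
import json

from langchain_openai.embeddings import OpenAIEmbeddings
from qdrant_client import QdrantClient

from pstuts_rag.datastore import initialize_vectorstore, transcripts_load

# Chunk the transcripts semantically, then index them in an in-memory Qdrant.
data = json.load(open("data/test.json", "rb"))
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
docs = transcripts_load(data, embeddings)

client = QdrantClient(":memory:")
store = initialize_vectorstore(client, "transcripts", embeddings)
store.add_documents(documents=docs)

retriever = store.as_retriever(search_kwargs={"k": 2})
```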
pstuts_rag/pstuts_rag/loader.py ADDED
@@ -0,0 +1,54 @@
+from langchain_core.document_loaders import BaseLoader
+from typing import List, Dict, Iterator
+from langchain_core.documents import Document
+
+
+class VideoTranscriptBulkLoader(BaseLoader):
+    """Loads video transcripts as a bulk into documents"""
+
+    def __init__(self, json_payload: List[Dict]):
+
+        self.json_payload = json_payload
+
+    def lazy_load(self) -> Iterator[Document]:
+        """Lazy loader that returns an iterator"""
+
+        for video in self.json_payload:
+            metadata = dict(video)
+            metadata.pop("transcripts", None)
+            metadata.pop("qa", None)
+            # Rename 'url' key to 'source' in metadata if it exists
+            if "url" in metadata:
+                metadata["source"] = metadata.pop("url")
+            yield Document(
+                page_content="\n".join(t["sent"] for t in video["transcripts"]),
+                metadata=metadata,
+            )
+
+
+class VideoTranscriptLoader(BaseLoader):
+    """Loads video transcripts as individual chunks into documents"""
+
+    def __init__(self, json_payload: List[Dict]):
+
+        self.json_payload = json_payload
+
+    def lazy_load(self) -> Iterator[Document]:
+        """Lazy loader that returns an iterator"""
+
+        for video in self.json_payload:
+            metadata = dict(video)
+            transcripts = metadata.pop("transcripts", None)
+            metadata.pop("qa", None)
+            # Rename 'url' key to 'source' in metadata if it exists
+            if "url" in metadata:
+                metadata["source"] = metadata.pop("url")
+            for transcript in transcripts:
+                yield Document(
+                    page_content=transcript["sent"],
+                    metadata=metadata
+                    | {
+                        "time_start": transcript["begin"],
+                        "time_end": transcript["end"],
+                    },
+                )
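Both loaders assume each payload entry is a video record whose `transcripts` list holds per-sentence dictionaries with `sent`, `begin`, and `end` keys; everything else becomes metadata (`url` is renamed to `source`, and `qa` is dropped). A hypothetical entry illustrating that shape (field values invented for the example):

```python
sample_payload = [
    {
        "video_id": 1,
        "title": "Working with layers",
        "url": "https://example.com/layers",  # becomes metadata["source"]
        "qa": [],  # stripped from metadata by both loaders
        "transcripts": [
            {"sent": "Layers stack your edits.", "begin": 0.0, "end": 3.1},
            {"sent": "Open the Layers panel to add one.", "begin": 3.1, "end": 6.4},
        ],
    }
]

# One Document per video (bulk) vs. one Document per sentence (chunks):
bulk_docs = list(VideoTranscriptBulkLoader(sample_payload).lazy_load())
chunk_docs = list(VideoTranscriptLoader(sample_payload).lazy_load())
```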
pstuts_rag/setup.py ADDED
@@ -0,0 +1,7 @@
+from setuptools import setup
+
+setup(name='pstuts_rag',
+      version='0.1',
+      description='PsTuts rag system',
+      packages=['pstuts_rag'],
+      zip_safe=False)
pyproject.toml CHANGED
@@ -1,5 +1,5 @@
 [project]
-name = "pstuts-rag"
+name = "pstuts_rag"
 version = "2025.05.12"
 description = "Agentic RAG system for PsTuts dataset"
 readme = "README.md"
@@ -33,11 +33,8 @@ dependencies = [
     "unstructured>=0.17.2",
     "uvicorn>=0.25.0,<0.26.0",
     "websockets==14.2",
-
 ]
-authors = [
-    { name="Marko Budisic", email="mbudisic@gmail.com" }
-]
+authors = [{ name = "Marko Budisic", email = "mbudisic@gmail.com" }]
 license = "MIT"

 [build-system]
@@ -45,4 +42,46 @@ requires = ["hatchling >= 1.26"]
 build-backend = "hatchling.build"

 [tool.hatch.build.targets.wheel]
-packages = ["pstuts-rag/pstuts-rag"]
+packages = ["pstuts_rag/pstuts_rag"]
+
+# [project.optional-dependencies]
+# dev = [
+#     "pytest>=7.0.0",
+#     "black>=22.0.0",
+#     "flake8>=4.0.0",
+#     "mypy>=0.900",
+# ]
+
+# [tool.ruff]
+# line-length = 88
+# target-version = "py313"
+# select = ["E", "F", "I", "N", "W"]
+# ignore = []
+
+# [tool.ruff.isort]
+# known-first-party = ["src"]
+
+# [tool.black]
+# line-length = 88
+# target-version = ["py313"]
+
+# [tool.mypy]
+# python_version = "3.13"
+# warn_return_any = true
+# warn_unused_configs = true
+# disallow_untyped_defs = true
+# mypy_path = ["pstuts_rag/pstuts_rag"]
+# namespace_packages = true
+# explicit_package_bases = true
+
+# [tool.flake8]
+# application-import-names = "pstuts_rag"
+# extend-ignore = "E203,W503"
+
+# [tool.pylint.MASTER]
+# load-plugins = "pylint_venv"  # optional but handy
+# source-roots = "pstuts_rag"
+# extension-pkg-allow-list = "numpy, torch"  # compiled deps that astroid cannot parse
+
+# [tool.pylint.TYPECHECK]
+# ignored-modules = "pkg_resources"  # suppress noisy vendored imports