Spaces:

mbudisic
/

PsTuts-RAG

Sleeping

App Files Files Community

mbudisic committed on May 13, 2025

Commit

9063e00

1 Parent(s): 5f0c5d1

Moving RAG to app

Browse files

Files changed (13) hide show

.vscode/settings.json +3 -0
app.py +61 -3
notebooks/transcript_rag.ipynb +361 -0
pstuts_rag/pstuts_rag.egg-info/PKG-INFO +5 -0
pstuts_rag/pstuts_rag.egg-info/SOURCES.txt +8 -0
pstuts_rag/pstuts_rag.egg-info/dependency_links.txt +1 -0
pstuts_rag/pstuts_rag.egg-info/not-zip-safe +1 -0
pstuts_rag/pstuts_rag.egg-info/top_level.txt +1 -0
pstuts_rag/pstuts_rag/__init__.py +0 -0
pstuts_rag/pstuts_rag/datastore.py +89 -0
pstuts_rag/pstuts_rag/loader.py +54 -0
pstuts_rag/setup.py +7 -0
pyproject.toml +45 -6

.vscode/settings.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+    "python.pythonPath": "/home/mbudisic/Documents/PsTuts-RAG/.venv/bin/python"
+}

app.py CHANGED Viewed

@@ -1,11 +1,69 @@
 import chainlit as cl
 @cl.on_message
 async def main(message: cl.Message):
     # Send a response back to the user
-    await cl.Message(
-        content=f"Hello! You said: {message.content}"
-    ).send()
 if __name__ == "__main__":

+from typing import List
 import chainlit as cl
+import json
+from langchain_experimental.text_splitter import SemanticChunker
+from langchain_openai.embeddings import OpenAIEmbeddings
+from langchain_core.documents import Document
+from langchain_qdrant import QdrantVectorStore
+from qdrant_client import QdrantClient
+from qdrant_client.http.models import Distance, VectorParams
+from dataclasses import dataclass
+import pstuts_rag.datastore
+@dataclass
+class ApplicationParameters:
+    filename = "data/test.json"
+    embedding_model = "text-embedding-3-small"
+class ApplicationState:
+    embeddings: OpenAIEmbeddings = None
+    docs: List[Document] = []
+    qdrantclient: QdrantClient = None
+    vectorstore: QdrantVectorStore = None
+    n_context_docs = 2
+    retriever = None
+state = ApplicationState()
+@cl.on_chat_start
+async def on_chat_start():
+    params = ApplicationParameters()
+    await cl.Message(content=f"Loading file {params.filename}").send()
+    data = json.load(open(params.filename, "rb"))
+    state.embeddings = OpenAIEmbeddings(model=params.embedding_model)
+    state.docs = pstuts_rag.datastore.transcripts_load(data, state.embeddings)
+    await cl.Message(
+        content=f"Loaded {len(state.docs)} chunks from file {params.filename}."
+    ).send()
+    state.qdrantclient = QdrantClient(":memory:")
+    state.vectorstore = pstuts_rag.datastore.initialize_vectorstore(
+        client=state.qdrantclient,
+        collection_name=f"{params.filename}_qdrant",
+        embeddings=state.embeddings,
+    )
+    _ = state.vectorstore.add_documents(documents=state.docs)
+    state.retriever = state.vectorstore.as_retriever(
+        search_kwargs={"k": state.n_context_docs}
+    )
 @cl.on_message
 async def main(message: cl.Message):
+    """Reply to an incoming chat message by echoing its content back."""
     # Send a response back to the user
+    await cl.Message(content=f"Hello! You said: {message.content}").send()
 if __name__ == "__main__":

notebooks/transcript_rag.ipynb ADDED Viewed

	@@ -0,0 +1,361 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from getpass import getpass\n",
+    "\n",
+    "from dotenv import load_dotenv\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pstuts_rag"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "load_dotenv()\n",
+    "\n",
+    "def set_api_key_if_not_present(key_name, prompt_message=\"\"):\n",
+    "    if len(prompt_message) == 0:\n",
+    "        prompt_message=key_name\n",
+    "    if key_name not in os.environ or not os.environ[key_name]:\n",
+    "        os.environ[key_name] = getpass(prompt_message)\n",
+    "\n",
+    "set_api_key_if_not_present(\"OPENAI_API_KEY\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Data Preparation\n",
+    "\n",
+    "First, we will read in the transcripts of the videos and convert them to Documents\n",
+    "with appropriate metadata."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "filename = \"../data/test.json\"\n",
+    "\n",
+    "data = json.load(open(filename, \"rb\"))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_experimental.text_splitter import SemanticChunker\n",
+    "from langchain_openai.embeddings import OpenAIEmbeddings\n",
+    "from pstuts_rag.datastore import transcripts_load\n",
+    "\n",
+    "embeddings = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n",
+    "docs_chunks_semantic = transcripts_load(data,embeddings)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## R - retrieval"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's hit it with a semantic chunker."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_qdrant import QdrantVectorStore\n",
+    "from qdrant_client import QdrantClient\n",
+    "from qdrant_client.http.models import Distance, VectorParams\n",
+    "\n",
+    "client = QdrantClient(\":memory:\")\n",
+    "\n",
+    "collection_name = f\"{filename}_qdrant\"\n",
+    "\n",
+    "client.create_collection(\n",
+    "    collection_name=collection_name,\n",
+    "    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),\n",
+    ")\n",
+    "\n",
+    "vector_store = QdrantVectorStore(\n",
+    "    client=client,\n",
+    "    collection_name=collection_name,\n",
+    "    embedding=embeddings,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "_ = vector_store.add_documents(documents=docs_chunks_semantic)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "retriever = vector_store.as_retriever(search_kwargs={\"k\":2})\n",
+    "\n",
+    "def retrieve(state):\n",
+    "    retrieved_docs = retriever.invoke(state[\"question\"])\n",
+    "    return {\"context\":retrieved_docs}\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "a = retrieve({\"question\":\"What is a layer?\"})\n",
+    "[ pp(d.page_content) for d in a[\"context\"] ]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## A - Augmentation\n",
+    "\n",
+    "We need to populate a prompt for LLM.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.prompts import ChatPromptTemplate\n",
+    "\n",
+    "SYSTEM_PROMPT = \"\"\"\\\n",
+    "You are a helpful an expert on Photoshop and your goal is to help users\n",
+    "gain knowledge from a database of training videos. \n",
+    "You answer questions based on provided context. \n",
+    "Your answers use emojis for emphasis.\n",
+    "\n",
+    "IMPORTANT: You must only use the provided context, and cannot use your own knowledge.\n",
+    "If there is no context that corresponds to the query, respond by saying\n",
+    "\"I don't know. This is not available in our training library.\"\n",
+    "\n",
+    "Most of the users questions will be in the form:\n",
+    "\"How can I do ...\"\n",
+    "or\n",
+    "\"What is ...\"\n",
+    "\n",
+    "When appropriate, provide your answers in a step-by-step form.\n",
+    "ALWAYS list the URL and the title of the reference video.\n",
+    "NEVER invent the explanation. ALWAYS use ONLY the context information.\n",
+    "\n",
+    "\"\"\"\n",
+    "\n",
+    "RAG_PROMPT=\"\"\"\\\n",
+    "\n",
+    "### Question\n",
+    "{question}\n",
+    "\n",
+    "NEVER invent the explanation. ALWAYS use ONLY the context information.\n",
+    "\n",
+    "### Context\n",
+    "{context}\n",
+    "\n",
+    "\n",
+    "\"\"\"\n",
+    "\n",
+    "rag_prompt = ChatPromptTemplate(\n",
+    "    [(\"system\",SYSTEM_PROMPT), \n",
+    "     (\"human\",RAG_PROMPT)\n",
+    "     ]\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## G - Generation\n",
+    "\n",
+    "We will use the GPT-4.1-nano model to generate answers."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_openai import ChatOpenAI\n",
+    "\n",
+    "llm = ChatOpenAI(model=\"gpt-4.1-nano\",temperature=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def generate(state):\n",
+    "  docs_content = \"\\n\\n\".join(doc.page_content for doc in state[\"context\"])\n",
+    "\n",
+    "  references = [ \n",
+    "                {k: doc.metadata[k] for k in (\"title\",\"source\",\"start\",\"stop\")} \n",
+    "                for doc in state[\"context\"] \n",
+    "  ] \n",
+    "\n",
+    "\n",
+    "  messages = rag_prompt.format_messages(question=state[\"question\"], \n",
+    "                                        context=docs_content)\n",
+    "  response = llm.invoke(messages)\n",
+    "  retval = {\"response\":f\"{response.content}\\n\\n**References**:\\n{json.dumps(references,indent=2)}\",\n",
+    "            \"context\":state[\"context\"]}\n",
+    "  \n",
+    "  return retval\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langgraph.graph import START, StateGraph\n",
+    "from typing_extensions import List, TypedDict,Annotated\n",
+    "from langchain_core.documents import Document\n",
+    "from langchain_core.messages import AIMessage, BaseMessage, HumanMessage\n",
+    "from langchain_openai.chat_models import ChatOpenAI\n",
+    "import operator\n",
+    "\n",
+    "class State(TypedDict):\n",
+    "    question: str\n",
+    "    context: List[Document]\n",
+    "    response: str\n",
+    "        \n",
+    "graph_builder = StateGraph(State).add_sequence([retrieve, generate ])\n",
+    "graph_builder.add_edge(START, \"retrieve\")\n",
+    "graph = graph_builder.compile()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 53,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.schema.output_parser import StrOutputParser\n",
+    "response = graph.invoke({\"question\" : \"What is the layer in Photoshop\"})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response.keys()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "type(response)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pp(response)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response.keys()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

pstuts_rag/pstuts_rag.egg-info/PKG-INFO ADDED Viewed

	@@ -0,0 +1,5 @@

+Metadata-Version: 2.4
+Name: pstuts_rag
+Version: 0.1
+Summary: PsTuts rag system
+Dynamic: summary

pstuts_rag/pstuts_rag.egg-info/SOURCES.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+setup.py
+pstuts_rag/__init__.py
+pstuts_rag/loader.py
+pstuts_rag.egg-info/PKG-INFO
+pstuts_rag.egg-info/SOURCES.txt
+pstuts_rag.egg-info/dependency_links.txt
+pstuts_rag.egg-info/not-zip-safe
+pstuts_rag.egg-info/top_level.txt

pstuts_rag/pstuts_rag.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+

pstuts_rag/pstuts_rag.egg-info/not-zip-safe ADDED Viewed

	@@ -0,0 +1 @@


1	+

pstuts_rag/pstuts_rag.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ pstuts_rag

pstuts_rag/pstuts_rag/__init__.py ADDED Viewed

File without changes

pstuts_rag/pstuts_rag/datastore.py ADDED Viewed

	@@ -0,0 +1,89 @@

+from typing import List, Dict, Iterator
+import functools
+from langchain_experimental.text_splitter import SemanticChunker
+from langchain_openai.embeddings import OpenAIEmbeddings
+from langchain_core.documents import Document
+from pstuts_rag.loader import VideoTranscriptBulkLoader, VideoTranscriptLoader
+from langchain_qdrant import QdrantVectorStore
+from qdrant_client import QdrantClient
+from qdrant_client.http.models import Distance, VectorParams
+def transcripts_load(
+    json_transcripts: List[Dict],
+    embeddings: OpenAIEmbeddings = OpenAIEmbeddings(model="text-embedding-3-small"),
+) -> List[Document]:
+    """
+    Load and process video transcripts into semantically chunked documents.
+    This function takes a list of transcript dictionaries, loads them as both full
+    transcripts and individual chunks, then applies semantic chunking. It also
+    enriches each semantic chunk with timestamp metadata from the original verbatim chunks.
+    Args:
+        json_transcripts: List of dictionaries containing video transcript data
+        embeddings: OpenAI embeddings model to use for semantic chunking
+    Returns:
+        List of semantically chunked Document objects with enhanced metadata
+    """
+    docs_full_transcript = VideoTranscriptBulkLoader(json_transcripts).load()
+    docs_chunks_verbatim = VideoTranscriptLoader(json_transcripts).load()
+    text_splitter = SemanticChunker(embeddings)
+    docs_chunks_semantic: List[Document] = text_splitter.split_documents(
+        docs_full_transcript
+    )
+    def is_subchunk(a: Document, ofb: Document) -> bool:
+        return (a.metadata["video_id"] == ofb.metadata["video_id"]) and (
+            a.page_content in ofb.page_content
+        )
+    # Create a lookup dictionary for faster access
+    video_id_to_chunks = {}
+    for chunk in docs_chunks_verbatim:
+        video_id = chunk.metadata["video_id"]
+        if video_id not in video_id_to_chunks:
+            video_id_to_chunks[video_id] = []
+        video_id_to_chunks[video_id].append(chunk)
+    for chunk in docs_chunks_semantic:
+        video_id = chunk.metadata["video_id"]
+        # Only check chunks from the same video
+        potential_subchunks = video_id_to_chunks.get(video_id, [])
+        subchunks = [
+            c for c in potential_subchunks if c.page_content in chunk.page_content
+        ]
+        times = [(t.metadata["time_start"], t.metadata["time_end"]) for t in subchunks]
+        chunk.metadata["speech_start_stop_times"] = times
+        if times:  # Avoid IndexError if times is empty
+            chunk.metadata["start"], chunk.metadata["stop"] = times[0][0], times[-1][-1]
+        else:
+            chunk.metadata["start"], chunk.metadata["stop"] = None, None
+    return docs_chunks_semantic
+def initialize_vectorstore(
+    client: QdrantClient, collection_name: str, embeddings: OpenAIEmbeddings
+) -> QdrantVectorStore:
+    client.create_collection(
+        collection_name=collection_name,
+        vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
+    )
+    vector_store = QdrantVectorStore(
+        client=client,
+        collection_name=collection_name,
+        embedding=embeddings,
+    )
+    return vector_store

pstuts_rag/pstuts_rag/loader.py ADDED Viewed

	@@ -0,0 +1,54 @@

+from langchain_core.document_loaders import BaseLoader
+from typing import List, Dict, Iterator
+from langchain_core.documents import Document
+class VideoTranscriptBulkLoader(BaseLoader):
+    """Loads video transcripts as a bulk into documents"""
+    def __init__(self, json_payload: List[Dict]):
+        self.json_payload = json_payload
+    def lazy_load(self) -> Iterator[Document]:
+        """Lazy loader that returns an iterator"""
+        for video in self.json_payload:
+            metadata = dict(video)
+            metadata.pop("transcripts", None)
+            metadata.pop("qa", None)
+            # Rename 'url' key to 'source' in metadata if it exists
+            if "url" in metadata:
+                metadata["source"] = metadata.pop("url")
+            yield Document(
+                page_content="\n".join(t["sent"] for t in video["transcripts"]),
+                metadata=metadata,
+            )
+class VideoTranscriptLoader(BaseLoader):
+    """Loads video transcripts as individual chunks into documents"""
+    def __init__(self, json_payload: List[Dict]):
+        self.json_payload = json_payload
+    def lazy_load(self) -> Iterator[Document]:
+        """Lazy loader that returns an iterator"""
+        for video in self.json_payload:
+            metadata = dict(video)
+            transcripts = metadata.pop("transcripts", None)
+            metadata.pop("qa", None)
+            # Rename 'url' key to 'source' in metadata if it exists
+            if "url" in metadata:
+                metadata["source"] = metadata.pop("url")
+            for transcript in transcripts:
+                yield Document(
+                    page_content=transcript["sent"],
+                    metadata=metadata
+                    | {
+                        "time_start": transcript["begin"],
+                        "time_end": transcript["end"],
+                    },
+                )

pstuts_rag/setup.py ADDED Viewed

	@@ -0,0 +1,7 @@

+from setuptools import setup
+setup(name='pstuts_rag',
+      version='0.1',
+      description='PsTuts rag system',
+      packages=['pstuts_rag'],
+      zip_safe=False)

pyproject.toml CHANGED Viewed

@@ -1,5 +1,5 @@
 [project]
-name = "pstuts-rag"
 version = "2025.05.12"
 description = "Agentic RAG system for PsTuts dataset"
 readme = "README.md"
@@ -33,11 +33,8 @@ dependencies = [
     "unstructured>=0.17.2",
     "uvicorn>=0.25.0,<0.26.0",
     "websockets==14.2",
 ]
-authors = [
-    { name="Marko Budisic", email="mbudisic@gmail.com" }
-    ]
 license = "MIT"
 [build-system]
@@ -45,4 +42,46 @@ requires = ["hatchling >= 1.26"]
 build-backend = "hatchling.build"
 [tool.hatch.build.targets.wheel]
-packages = ["pstuts-rag/pstuts-rag"]

 [project]
+name = "pstuts_rag"
 version = "2025.05.12"
 description = "Agentic RAG system for PsTuts dataset"
 readme = "README.md"
     "unstructured>=0.17.2",
     "uvicorn>=0.25.0,<0.26.0",
     "websockets==14.2",
 ]
+authors = [{ name = "Marko Budisic", email = "mbudisic@gmail.com" }]
 license = "MIT"
 [build-system]
 build-backend = "hatchling.build"
 [tool.hatch.build.targets.wheel]
+packages = ["pstuts_rag/pstuts_rag"]
+# [project.optional-dependencies]
+# dev = [
+#     "pytest>=7.0.0",
+#     "black>=22.0.0",
+#     "flake8>=4.0.0",
+#     "mypy>=0.900",
+# ]
+# [tool.ruff]
+# line-length = 88
+# target-version = "py313"
+# select = ["E", "F", "I", "N", "W"]
+# ignore = []
+# [tool.ruff.isort]
+# known-first-party = ["src"]
+# [tool.black]
+# line-length = 88
+# target-version = ["py313"]
+# [tool.mypy]
+# python_version = "3.13"
+# warn_return_any = true
+# warn_unused_configs = true
+# disallow_untyped_defs = true
+# mypy_path           = ["pstuts_rag/pstuts_rag"]
+# namespace_packages  = true
+# explicit_package_bases = true
+# [tool.flake8]
+# application-import-names = "pstuts_rag"
+# extend-ignore = "E203,W503"
+# [tool.pylint.MASTER]
+# load-plugins      = "pylint_venv"          # optional but handy
+# source-roots      = "pstuts_rag"
+# extension-pkg-allow-list = "numpy, torch"  # compiled deps that astroid cannot parse
+# [tool.pylint.TYPECHECK]
+# ignored-modules   = "pkg_resources"        # suppress noisy vendored imports