Eoin McGrath commited on
Commit
257dcc1
·
1 Parent(s): ed803e4

initial commit

Browse files
.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ .env
2
+ __pycache__
3
+ littlejs
4
+ .DS_Store
5
+ venv
6
+ hf_token
README.md CHANGED
@@ -1,13 +1,37 @@
1
  ---
2
  title: AI Project
3
- emoji: 👀
 
4
  colorFrom: yellow
5
  colorTo: pink
6
  sdk: gradio
7
  sdk_version: 5.22.0
8
  app_file: app.py
9
  pinned: false
10
- short_description: Chat bot with RAG fo LittleJS game engine
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: AI Project
3
+ emoji: 👾
4
+ colorFrom: blue
5
6
  colorTo: pink
7
  sdk: gradio
8
  sdk_version: 5.22.0
9
  app_file: app.py
10
  pinned: false
11
+ short_description: Chat bot with RAG for LittleJS game engine
12
  ---
13
 
14
+ # TowardsAI Course final project
15
+
16
+ By Eoin McGrath eoin.mcg@gmail.com
17
+
18
+ ## Title
19
+ Game dev tutor focusing on LittleJS framework
20
+
21
+ ## Overview
22
+ Data gathered from offical docs and github repo source.
23
+ Processed and generated embeddings stored in Chroma DB.
24
+ Evaluation scripts and data provided.
25
+ Reranker used to improve generated answers.
26
+
27
+ ## Optional Extras
28
+ 1. Implement streaming responses.
29
+ 2. The app is designed for a specific goal/domain that is not a tutor about AI: designed for a specific javascript based game engine
30
+ 3. You have shown evidence of collecting at least two data sources beyond those provided in our course:
31
+ fetch_docs.py = output stored in ./data/littlejs_docs.csv
32
+ fetch_repo.py = output stored in ./data/littlejs_repo.csv
33
+ 4. There’s code for RAG evaluation in the folder, and the README contains the evaluation results:
34
+ eval_generate_dataset.py = generates synthetic question context pairs
35
+ eval_process_dataset.py = evaluation results based on above questions
36
+ eval_results.txt = sample saved results
37
+ 5. Use a reranker in your RAG pipeline. It can be a fine-tuned version (your choice): uses LLMRerank postprocessor
app.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+
4
+ import gradio as gr
5
+
6
+ from llama_index.core.retrievers import VectorIndexRetriever
7
+ from llama_index.core.llms import MessageRole
8
+ from llama_index.core.memory import ChatSummaryMemoryBuffer
9
+ from llama_index.core.tools import RetrieverTool, ToolMetadata
10
+ from llama_index.agent.openai import OpenAIAgent
11
+ from llama_index.embeddings.openai import OpenAIEmbedding
12
+ from llama_index.llms.openai import OpenAI
13
+ from llama_index.core import Settings
14
+ from llama_index.core.postprocessor import LLMRerank
15
+
16
+ from utils import create_db, load_db, load_asset
17
+ from config import CHROMA_PATH, PLACEHOLDER, TITLE, PROMPT_SYSTEM_MESSAGE, TEXT_QA_TEMPLATE
18
+
19
+
20
+ logger = logging.getLogger(__name__)
21
+ logging.basicConfig(level=logging.INFO)
22
+ logging.getLogger("httpx").setLevel(logging.WARNING)
23
+
24
+
25
+ API_KEY=""
26
+ token_count = 0
27
+
28
+ def create_knowledge_base_if_not_exists():
29
+ if not os.path.exists(CHROMA_PATH) or not os.listdir(CHROMA_PATH):
30
+ print("⚠️ ChromaDB not found. Creating DB...")
31
+ create_db()
32
+
33
+ def get_tools():
34
+ index = load_db()
35
+ vector_retriever = VectorIndexRetriever(
36
+ index=index,
37
+ similarity_top_k=15,
38
+ embed_model=Settings.embed_model,
39
+ use_async=True,
40
+ )
41
+
42
+ # Add LLMRerank for better retrieval
43
+ reranker = LLMRerank(
44
+ choice_batch_size=5,
45
+ top_n=3,
46
+ )
47
+
48
+ def retrieve_with_rerank(query):
49
+ retrieved_docs = vector_retriever.retrieve(query)
50
+ reranked_docs = reranker.postprocess(retrieved_docs)
51
+ return reranked_docs
52
+
53
+ tools = [
54
+ RetrieverTool(
55
+ # retriever=vector_retriever,
56
+ retriever=retrieve_with_rerank,
57
+ metadata=ToolMetadata(
58
+ name="LitleJS_related_resources",
59
+ description="Useful for info related to the LittleJS game development library. It gathers the info from local data.",
60
+ ),
61
+ )
62
+ ]
63
+ return tools
64
+
65
+ def set_api_key(key):
66
+ API_KEY=key
67
+ Settings.llm = OpenAI(temperature=0, model="gpt-4o-mini", api_key=API_KEY)
68
+ Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
69
+
70
+
71
+ def generate_completion(query, history, memory, api_key):
72
+ logging.info(f"User query: {query}")
73
+
74
+ if not API_KEY:
75
+ set_api_key(api_key)
76
+
77
+ # Manage memory
78
+ chat_list = memory.get()
79
+ if len(chat_list) != 0:
80
+ user_index = [i for i, msg in enumerate(chat_list) if msg.role == MessageRole.USER]
81
+ if len(user_index) > len(history):
82
+ user_index_to_remove = user_index[len(history)]
83
+ chat_list = chat_list[:user_index_to_remove]
84
+ memory.set(chat_list)
85
+ logging.info(f"chat_history: {len(memory.get())} {memory.get()}")
86
+ logging.info(f"gradio_history: {len(history)} {history}")
87
+
88
+ # Create agent
89
+ tools = get_tools()
90
+
91
+ agent = OpenAIAgent.from_tools(
92
+ llm=Settings.llm,
93
+ memory=memory,
94
+ tools=tools,
95
+ system_prompt=PROMPT_SYSTEM_MESSAGE,
96
+ )
97
+
98
+ # Generate answer
99
+ completion = agent.stream_chat(query)
100
+ answer_str = ""
101
+ for token in completion.response_gen:
102
+ answer_str += token
103
+ global token_count
104
+ token_count += 1 # Update token count
105
+ yield answer_str
106
+
107
+
108
+
109
+ def launch_ui():
110
+
111
+ js=load_asset("./assets/chat.js")
112
+
113
+ with gr.Blocks(
114
+ title=TITLE,
115
+ fill_height=True,
116
+ analytics_enabled=True,
117
+ css=load_asset("./assets/style.css"),
118
+ js=load_asset("./assets/chat.js"),
119
+ ) as demo:
120
+
121
+ api_key_input = gr.Textbox(
122
+ label="Enter your OpenAI API Key",
123
+ type="password",
124
+ placeholder="sk-...",
125
+ elem_classes="api_key_input"
126
+ )
127
+
128
+ memory_state = gr.State(
129
+ lambda: ChatSummaryMemoryBuffer.from_defaults(
130
+ token_limit=120000,
131
+ )
132
+ )
133
+ chatbot = gr.Chatbot(
134
+ scale=1,
135
+ placeholder=PLACEHOLDER,
136
+ type='messages',
137
+ show_label=False,
138
+ show_copy_button=True,
139
+ elem_classes="chatbox",
140
+ )
141
+
142
+ gr.ChatInterface(
143
+ fn=generate_completion,
144
+ chatbot=chatbot,
145
+ type='messages',
146
+ additional_inputs=[memory_state, api_key_input],
147
+ )
148
+
149
+ token_counter = gr.Button("Tokens Used: 0", elem_classes="token_counter")
150
+
151
+ demo.queue(default_concurrency_limit=64)
152
+ demo.launch(debug=True, favicon_path="./assets/favicon.png", share=False) # Set share=True to share the app online
153
+
154
+
155
+ if __name__ == "__main__":
156
+ create_knowledge_base_if_not_exists()
157
+ launch_ui()
assets/chat.js ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
function createGradioAnimation() {
    // Banner container for the animated title text.
    var container = document.createElement('div');
    container.id = 'gradio-animation';
    container.style.fontSize = '2em';
    container.style.fontWeight = 'bold';
    container.style.textAlign = 'center';
    container.style.marginBottom = '20px';

    // Fade the title in one letter at a time, a new letter every 50 ms.
    var text = 'LittleJS Game Tutor 👾';
    for (var i = 0; i < text.length; i++) {
        (function(i){  // IIFE captures the loop index for the timeouts
            setTimeout(function(){
                var letter = document.createElement('span');
                letter.style.opacity = '0';
                letter.style.transition = 'opacity 0.5s';
                letter.innerText = text[i];

                container.appendChild(letter);

                setTimeout(function() {
                    letter.style.opacity = '1';
                }, 20);
            }, i * 50);
        })(i);
    }

    var gradioContainer = document.querySelector('.gradio-container');
    gradioContainer.insertBefore(container, gradioContainer.firstChild);


    // Reveal the chat widgets once the user has supplied an API key.
    const showChat = () => {
        header.style.display = 'none';
        chatbox.style.display = 'block';
        chatinput.style.display = 'block'
        token_counter.style.display = 'none';
    }

    const header = container;
    const input = document.querySelector('.api_key_input');
    input.focus();
    const chatbox = document.querySelector('.chatbox');
    // NOTE(review): assumes the chat input is the second .form element —
    // fragile against Gradio layout changes; confirm after upgrades.
    const chatinput = document.querySelectorAll('.form')[1];
    const token_counter = document.querySelector('.token_counter');


    // Start with the chat UI hidden until a key is entered.
    chatbox.style.display = 'none';
    chatinput.style.display = 'none';
    token_counter.style.display = 'none';

    // Leaving or pasting into the key field switches to the chat view.
    input.addEventListener(('blur'), (e) => {
        showChat();
    })
    input.addEventListener(('paste'), (e) => {
        showChat();
    })

    return 'Animation created';
}
59
+
60
+
assets/favicon.png ADDED
assets/style.css ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ footer { display: none !important; }
2
+ /* Custom CSS for LittleJS Chatbot */
3
+
4
+ .placeholder { text-align: center; }
5
+ .hide { display: none; }
6
+
7
+ /* Main container styling */
8
+ body {
9
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
10
+ }
11
+
12
+ /* API Key input styling */
13
+ .api-key-container {
14
+ max-width: 600px;
15
+ margin: 50px auto;
16
+ padding: 20px;
17
+ border-radius: 10px;
18
+ box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
19
+ background-color: #f8f9fa;
20
+ }
21
+
22
+ /* Chat container */
23
+ .chat-container {
24
+ border-radius: 10px;
25
+ box-shadow: 0 4px 12px rgba(0, 0, 0, 0.05);
26
+ background-color: #ffffff;
27
+ }
28
+
29
+ /* Message bubbles */
30
+ .message-bubble {
31
+ padding: 12px 16px;
32
+ border-radius: 16px;
33
+ max-width: 85%;
34
+ margin-bottom: 10px;
35
+ }
36
+
37
+ .user-message {
38
+ background-color: #e3f2fd;
39
+ align-self: flex-end;
40
+ }
41
+
42
+ .bot-message {
43
+ background-color: #f1f3f4;
44
+ align-self: flex-start;
45
+ }
46
+
47
+ /* Token display */
48
+ .token-display {
49
+ background-color: #f8f9fa;
50
+ border-radius: 8px;
51
+ padding: 10px;
52
+ font-size: 0.9em;
53
+ border-left: 4px solid #4285f4;
54
+ }
55
+
56
+ /* Buttons */
57
+ .primary-button {
58
+ background-color: #4285f4;
59
+ color: white;
60
+ border: none;
61
+ border-radius: 4px;
62
+ padding: 8px 16px;
63
+ cursor: pointer;
64
+ transition: background-color 0.2s;
65
+ }
66
+
67
+ .primary-button:hover {
68
+ background-color: #3367d6;
69
+ }
70
+
71
+ .secondary-button {
72
+ background-color: #f1f3f4;
73
+ color: #202124;
74
+ border: 1px solid #dadce0;
75
+ border-radius: 4px;
76
+ padding: 8px 16px;
77
+ cursor: pointer;
78
+ transition: background-color 0.2s;
79
+ }
80
+
81
+ .secondary-button:hover {
82
+ background-color: #e8eaed;
83
+ }
84
+
85
+ /* Code blocks */
86
+ pre {
87
+ background-color: #f5f7f9;
88
+ border-radius: 6px;
89
+ padding: 12px;
90
+ overflow-x: auto;
91
+ }
92
+
93
+ code {
94
+ font-family: 'Courier New', Courier, monospace;
95
+ font-size: 0.9em;
96
+ }
97
+
98
+ /* Error message */
99
+ .error-message {
100
+ color: #d32f2f;
101
+ font-size: 0.9em;
102
+ margin-top: 8px;
103
+ }
config.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ TITLE="LittleJS Game Tutor 👾"
2
+ PLACEHOLDER="<h1>LittleJS Game Tutor 👾</h1><br> A Question-Answering Bot for anything LittleJS related</strong><br>"
3
+ CHROMA_PATH="littlejs"
4
+ CHROMA_COLLECTION="littlejs"
5
+ CHUNK_SIZE=512
6
+ CHUNK_OVERLAP=128
7
+ FILES=["./data/littlejs_docs.csv", "./data/littlejs_repo.csv"]
8
+ # FILES=["./data/littlejs_docs.csv"]
9
+
10
+ PROMPT_SYSTEM_MESSAGE = """You are a programming teacher, answering questions from students of a course on game development using the LittleJS library.
11
+ Topics covered include installing, set up, creating games with LittleJS, debugging techniques etc. Questions should be understood in this context. Your answers are aimed to teach
12
+ students, so they should be complete, clear, and easy to understand. Use the available tools to gather insights pertinent to game development with LittleJS.
13
+ To find relevant information for answering student questions, always use the "LitleJS_related_resources" tool.
14
+
15
+ Only some information returned by the tool might be relevant to the question, so ignore the irrelevant part and answer the question with what you have. Your responses are exclusively based on the output provided
16
+ by the tools. Refrain from incorporating information not directly obtained from the tool's responses.
17
+ If a user requests further elaboration on a specific aspect of a previously discussed topic, you should reformulate your input to the tool to capture this new angle or more profound layer of inquiry. Provide
18
+ comprehensive answers, ideally structured in multiple paragraphs, drawing from the tool's variety of relevant details. Provide code samples where possible. The depth and breadth of your responses should align with the scope and specificity of the information retrieved.
19
+ Should the tool response lack information on the queried topic, politely inform the user that the question transcends the bounds of your current knowledge base, citing the absence of relevant content in the tool's documentation.
20
+ At the end of your answers, always invite the students to ask deeper questions about the topic if they have any.
21
+ Do not refer to the documentation directly, but use the information provided within it to answer questions. If code is provided in the information, share it with the students. It's important to provide complete code blocks so
22
+ they can execute the code when they copy and paste them. Make sure to format your answers in Markdown format, including code blocks and snippets.
23
+ """
24
+
25
+ TEXT_QA_TEMPLATE = """
26
+ You must answer only related to LittleJS, game development and related concepts queries.
27
+ Always leverage the retrieved documents to answer the questions, don't answer them on your own.
28
+ If the query is not relevant to LittleJS, say that you don't know the answer.
29
+ """
data/extra_resources.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ https://raw.githubusercontent.com/KilledByAPixel/LittleJS/refs/heads/main/FAQ.md
2
+ https://raw.githubusercontent.com/KilledByAPixel/LittleJS/refs/heads/main/examples/breakoutTutorial/README.md
data/littlejs_docs.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/littlejs_repo.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/littlejsdocs.txt ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ https://killedbyapixel.github.io/LittleJS/docs/
2
+ https://killedbyapixel.github.io/LittleJS/docs/Color.html
3
+ https://killedbyapixel.github.io/LittleJS/docs/EngineObject.html
4
+ https://killedbyapixel.github.io/LittleJS/docs/FontImage.html
5
+ https://killedbyapixel.github.io/LittleJS/docs/Medal.html
6
+ https://killedbyapixel.github.io/LittleJS/docs/Music.html
7
+ https://killedbyapixel.github.io/LittleJS/docs/Particle.html
8
+ https://killedbyapixel.github.io/LittleJS/docs/ParticleEmitter.html
9
+ https://killedbyapixel.github.io/LittleJS/docs/RandomGenerator.html
10
+ https://killedbyapixel.github.io/LittleJS/docs/Sound.html
11
+ https://killedbyapixel.github.io/LittleJS/docs/SoundWave.html
12
+ https://killedbyapixel.github.io/LittleJS/docs/TextureInfo.html
13
+ https://killedbyapixel.github.io/LittleJS/docs/TileInfo.html
14
+ https://killedbyapixel.github.io/LittleJS/docs/TileLayer.html
15
+ https://killedbyapixel.github.io/LittleJS/docs/TileLayerData.html
16
+ https://killedbyapixel.github.io/LittleJS/docs/Timer.html
17
+ https://killedbyapixel.github.io/LittleJS/docs/Vector2.html
18
+ https://killedbyapixel.github.io/LittleJS/docs/Audio.html
19
+ https://killedbyapixel.github.io/LittleJS/docs/Debug.html
20
+ https://killedbyapixel.github.io/LittleJS/docs/Draw.html
21
+ https://killedbyapixel.github.io/LittleJS/docs/Engine.html
22
+ https://killedbyapixel.github.io/LittleJS/docs/Input.html
23
+ https://killedbyapixel.github.io/LittleJS/docs/Medals.html
24
+ https://killedbyapixel.github.io/LittleJS/docs/Random.html
25
+ https://killedbyapixel.github.io/LittleJS/docs/Settings.html
26
+ https://killedbyapixel.github.io/LittleJS/docs/TileCollision.html
27
+ https://killedbyapixel.github.io/LittleJS/docs/Utilities.html
28
+ https://killedbyapixel.github.io/LittleJS/docs/WebGL.html
eval_generate_dataset.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from dotenv import load_dotenv
4
+
5
+ from llama_index.core import Settings
6
+ from llama_index.embeddings.openai import OpenAIEmbedding
7
+
8
+ from llama_index.llms.openai import OpenAI
9
+ from llama_index.llms.gemini import Gemini
10
+
11
+ from llama_index.core.evaluation import generate_question_context_pairs
12
+ from llama_index.core.evaluation import RetrieverEvaluator
13
+ from llama_index.core.query_engine import RetrieverQueryEngine
14
+
15
+ from utils import create_db
16
+ from config import CHROMA_PATH, CHROMA_COLLECTION
17
+
18
+ load_dotenv()
19
+
20
+ Settings.llm = OpenAI(temperature=0, model="gpt-4o-mini", max_tokens=512)
21
+ Settings.embed_model = OpenAIEmbedding(
22
+ model="text-embedding-3-small"
23
+ )
24
+
25
+ nodes = create_db(return_nodes=True)
26
+
27
+ # Free Tier-Gemini API key
28
+ from llama_index.core.llms.utils import LLM
29
+ from llama_index.core.schema import MetadataMode, TextNode
30
+ from tqdm import tqdm
31
+ import json
32
+ import re
33
+ import uuid
34
+ import warnings
35
+ import time
36
+ from typing import Dict, List, Tuple
37
+ from llama_index.core.evaluation import EmbeddingQAFinetuneDataset
38
+
39
+ DEFAULT_QA_GENERATE_PROMPT_TMPL = """\
40
+ Context information is below.
41
+
42
+ ---------------------
43
+ {context_str}
44
+ ---------------------
45
+
46
+ Given the context information and not prior knowledge.
47
+ generate only questions based on the below query.
48
+
49
+ You are a Teacher/ Professor. Your task is to setup \
50
+ {num_questions_per_chunk} questions for an upcoming \
51
+ quiz/examination. The questions should be diverse in nature \
52
+ across the document. Restrict the questions to the \
53
+ context information provided."
54
+ """
55
+
56
def generate_question_context_pairs(
    nodes: List[TextNode],
    llm: LLM,
    qa_generate_prompt_tmpl: str = DEFAULT_QA_GENERATE_PROMPT_TMPL,
    num_questions_per_chunk: int = 2,
    request_delay: float = 2.0
) -> EmbeddingQAFinetuneDataset:
    """Generate examples given a set of nodes with delays between requests.

    Rate-limit-friendly variant of LlamaIndex's own
    generate_question_context_pairs: same output format, but sleeps
    `request_delay` seconds between LLM calls (needed for free-tier APIs).

    Args:
        nodes: text chunks to generate questions for.
        llm: the LLM used to author the questions.
        qa_generate_prompt_tmpl: template with {context_str} and
            {num_questions_per_chunk} placeholders.
        num_questions_per_chunk: questions requested per node.
        request_delay: seconds to sleep after each LLM call.

    Returns:
        EmbeddingQAFinetuneDataset mapping generated question ids to the
        node each question was derived from.
    """
    node_dict = {
        node.node_id: node.get_content(metadata_mode=MetadataMode.NONE)
        for node in nodes
    }

    queries = {}
    relevant_docs = {}

    for node_id, text in tqdm(node_dict.items()):
        query = qa_generate_prompt_tmpl.format(
            context_str=text, num_questions_per_chunk=num_questions_per_chunk
        )
        response = llm.complete(query)

        # The LLM returns a numbered list; strip "1)", "2." etc. prefixes
        # and drop blank lines, keeping at most the requested count.
        result = str(response).strip().split("\n")
        questions = [
            re.sub(r"^\d+[\).\s]", "", question).strip() for question in result
        ]
        questions = [question for question in questions if len(question) > 0][
            :num_questions_per_chunk
        ]

        num_questions_generated = len(questions)
        if num_questions_generated < num_questions_per_chunk:
            warnings.warn(
                f"Fewer questions generated ({num_questions_generated}) "
                f"than requested ({num_questions_per_chunk})."
            )

        for question in questions:
            question_id = str(uuid.uuid4())
            queries[question_id] = question
            relevant_docs[question_id] = [node_id]

        # Throttle to stay under rate limits.
        time.sleep(request_delay)

    return EmbeddingQAFinetuneDataset(
        queries=queries, corpus=node_dict, relevant_docs=relevant_docs
    )
103
+
104
+ #from llama_index.core.evaluation import generate_question_context_pairs
105
+ from llama_index.llms.gemini import Gemini
106
+
107
+ llm = Gemini(model="models/gemini-1.5-flash", temperature=1, max_tokens=512)
108
+
109
+ rag_eval_dataset = generate_question_context_pairs(
110
+ nodes[:25],
111
+ llm=llm,
112
+ num_questions_per_chunk=1,
113
+ request_delay=4
114
+ )
115
+
116
+ # Save the dataset as a json file for later use
117
+ rag_eval_dataset.save_json("./rag_eval_dataset.json")
eval_process_dataset.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from dotenv import load_dotenv
3
+ from llama_index.core import Settings
4
+ from llama_index.core.callbacks import CallbackManager, LlamaDebugHandler
5
+
6
+ from llama_index.core.evaluation import RetrieverEvaluator
7
+ from llama_index.core.evaluation import EmbeddingQAFinetuneDataset
8
+ from llama_index.core.evaluation import RelevancyEvaluator, FaithfulnessEvaluator, BatchEvalRunner
9
+ from llama_index.llms.openai import OpenAI
10
+ from utils import load_db # Assuming utils.py is in the same directory
11
+
12
+ from llama_index.llms.openai import OpenAI
13
+ from llama_index.embeddings.openai import OpenAIEmbedding
14
+
15
+ import asyncio
16
+ import nest_asyncio
17
+
18
+ # Apply nest_asyncio to allow nested event loops
19
+ nest_asyncio.apply()
20
+
21
+ Settings.embed_model = OpenAIEmbedding(
22
+ model="text-embedding-ada-002",
23
+ timeout=60, # Increase timeout to 60 seconds
24
+ max_retries=5, # Increase max retries
25
+ retry_delay=2 # Wait 2 seconds between retries
26
+ )
27
+ # Create a debug handler to see more info about API calls
28
+ # debug_handler = LlamaDebugHandler(print_trace_on_end=True)
29
+ # callback_manager = CallbackManager([debug_handler])
30
+
31
+ # Configure global settings
32
+ # Settings.callback_manager = callback_manager
33
+ Settings.retry_policy = {
34
+ "max_retries": 5,
35
+ "retry_delay": 2.0,
36
+ "exponential_backoff": True
37
+ }
38
async def run_evaluation():
    """Evaluate the RAG pipeline's faithfulness and relevancy.

    Loads the synthetic QA dataset produced by eval_generate_dataset.py and
    runs batch evaluation for several similarity_top_k settings, printing a
    faithfulness and relevancy pass-rate for each.
    """
    load_dotenv()
    index = load_db()
    rag_eval_dataset = EmbeddingQAFinetuneDataset.from_json("./rag_eval_dataset.json")
    # Define an LLM as a judge (gpt-4o) and a cheaper answering model (gpt-4o-mini)
    llm_gpt4o = OpenAI(temperature=0, model="gpt-4o")
    llm_gpt4o_mini = OpenAI(temperature=0, model="gpt-4o-mini")
    # Initiate the faithfulness and relevancy evaluator objects
    faithfulness_evaluator = FaithfulnessEvaluator(llm=llm_gpt4o)
    relevancy_evaluator = RelevancyEvaluator(llm=llm_gpt4o)
    # Extract the questions from the dataset
    queries = list(rag_eval_dataset.queries.values())
    # Limit to the first 5 questions to save time (!!remove this line in production!!)
    batch_eval_queries = queries[:5]
    # The batch evaluator runs the evaluation in batches
    runner = BatchEvalRunner(
        {"faithfulness": faithfulness_evaluator, "relevancy": relevancy_evaluator},
        workers=8,
    )
    # Define a for-loop to try different `similarity_top_k` values
    for i in [2,4,6]:
        # Set query engine with different number of returned chunks
        query_engine = index.as_query_engine(similarity_top_k=i, llm=llm_gpt4o_mini)
        # Run the evaluation
        eval_results = await runner.aevaluate_queries(query_engine, queries=batch_eval_queries)
        # Score = fraction of queries the judge marked as passing
        faithfulness_score = sum(
            result.passing for result in eval_results["faithfulness"]
        ) / len(eval_results["faithfulness"])
        print(f"top_{i} faithfulness_score: {faithfulness_score}")
        relevancy_score = sum(result.passing for result in eval_results["relevancy"]) / len(
            eval_results["relevancy"]
        )
        print(f"top_{i} relevancy_score: {relevancy_score}")
        print("="*15)


# Run the async function
if __name__ == "__main__":
    asyncio.run(run_evaluation())
eval_results.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ top_2 faithfulness_score: 0.4
2
+ top_2 relevancy_score: 0.6
3
+ ===============
4
+ top_4 faithfulness_score: 0.5
5
+ top_4 relevancy_score: 0.9
6
+ ===============
7
+ top_6 faithfulness_score: 0.4
8
+ top_6 relevancy_score: 0.75
9
+ ===============
fetch_docs.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import csv
import requests
from bs4 import BeautifulSoup

# Read the list of documentation URLs, skipping comment lines and blanks.
with open("./data/littlejsdocs.txt") as file:
    urls = file.readlines()
    # Strip whitespace/newlines; drop comments and empty lines (a blank line
    # would otherwise produce a bogus requests.get("") call).
    urls = [url.strip() for url in urls if url.strip() and not url.startswith('#')]

def parse_webpage(url: str):
    """Fetch one docs page and return [title, body text, url].

    Removes the footer from the page's main content wrapper so only the
    documentation text is kept.
    """
    html = requests.get(url).text
    soup = BeautifulSoup(html, "html.parser")
    title = soup.find("title").get_text()
    text = soup.find("div", class_="main-wrapper")
    text.find('footer').extract()
    return [
        title,
        text.get_text(),
        url
    ]

docs = []
for url in urls:
    docs.append(parse_webpage(url))

# BUG FIX: write to the filename the rest of the project reads —
# config.FILES and the README reference ./data/littlejs_docs.csv, but this
# script previously wrote ./data/littledocs.csv.
with open('./data/littlejs_docs.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    field = ["title", "text", "url"]

    writer.writerow(field)
    for line in docs:
        writer.writerow([line[0], line[1], line[2]])

print('DONE')
fetch_repo.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import requests
4
+ import csv
5
+ from urllib.parse import urljoin
6
+
7
+ from llama_index.core import Document
8
+
9
+ FILE="repo_files.csv"
10
+
11
def get_github_repo_files(repo_url):
    """
    Fetches all JavaScript files in the 'src' directory of a GitHub repository.

    Args:
        repo_url (str): The URL of the GitHub repository.

    Returns:
        list: A list of dictionaries, each containing file information
        (filename, source, url). Empty list on any error (errors are
        printed, not raised).
    """
    try:
        # e.g. https://github.com/owner/repo -> "owner/repo"
        repo_name = repo_url.split("github.com/")[1].rstrip("/")
        api_url = f"https://api.github.com/repos/{repo_name}/contents/src"
        response = requests.get(api_url)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        files = response.json()
        js_files = []

        for file in files:
            if file["type"] == "file" and file["name"].endswith(".js"):
                file_info = {
                    "filename": file["name"],
                    "url": file["download_url"],
                }
                # Download the raw source of each matching .js file.
                file_content_response = requests.get(file["download_url"])
                file_content_response.raise_for_status()
                file_info["source"] = file_content_response.text
                js_files.append(file_info)

        return js_files

    except requests.exceptions.RequestException as e:
        print(f"Error fetching repository files: {e}")
        return []
    except IndexError:
        # repo_url did not contain "github.com/"
        print("Invalid GitHub repository URL format.")
        return []
    except KeyError:
        # API returned an error object instead of a file listing
        print("src directory not found in the repository")
        return []
    except Exception as e:
        print(f"An unexpected error occured: {e}")
        return []
54
+
55
def write_to_csv(data, output_file=FILE):
    """
    Persist a list of file-info dicts to a CSV file.

    Args:
        data (list): Dictionaries with "filename", "source" and "url" keys.
        output_file (str): Destination CSV path.
    """
    columns = ["filename", "source", "url"]
    try:
        with open(output_file, "w", newline="", encoding="utf-8") as handle:
            writer = csv.DictWriter(handle, fieldnames=columns)
            writer.writeheader()
            writer.writerows(data)
            print(f"File information written to {output_file}")
    except Exception as e:
        print(f"Error writing to CSV: {e}")
74
+
75
def create_csv():
    """
    Main function to parse arguments and process the repository.
    """
    # NOTE(review): the parser is constructed but parse_args() is never
    # called and the repo URL is hard-coded below, so the CLI argument is
    # dead code — confirm intent before removing.
    parser = argparse.ArgumentParser(description="Fetch JavaScript files from a GitHub repository's src directory.")
    parser.add_argument("repo_url", help="The GitHub repository URL.")

    repo_url = "https://github.com/KilledByAPixel/LittleJS"

    files = get_github_repo_files(repo_url)
    if files:
        write_to_csv(files)


# NOTE(review): module-level script section — assumes FILE already exists on
# disk; nothing in this file appears to call create_csv(), so it must be run
# manually first. Also note FILE is "repo_files.csv" while the README refers
# to ./data/littlejs_repo.csv — verify which path is canonical.
print('READY?')

rows = []
# Read the CSV back and collect data rows (header skipped)
with open(FILE, mode="r", encoding="utf-8") as file:
    csv_reader = csv.reader(file)

    for idx, row in enumerate(csv_reader):
        if idx == 0: continue; # Skip header row
        rows.append(row)

# Convert the chunks to Document objects so the LlamaIndex framework can process them.
documents = [Document(text=row[1], metadata={"title": row[0], "url": row[2]}) for row in rows]
print(len(documents))
print(documents[0].metadata)
print(documents[0])
optional.txt ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 1. [X] Implement streaming responses.
2
+
3
+ ?? Uses dynamic few-shot prompting, where the best examples are selected according to the user query.
4
+
5
+ 2. [] There’s code for RAG evaluation in the folder, and the README contains the evaluation results.
6
+ The folder must also contain the evaluation dataset and the evaluation scripts.
7
+ https://academy.towardsai.net/courses/take/beginner-to-advanced-llm-dev/multimedia/59791119-evaluating-your-rag-pipeline
8
+
9
+ 3. [X] The app is designed for a specific goal/domain that is not a tutor about AI. For example, it could be about finance, healthcare, etc.
10
+
11
+ Use live search results. In this case, the user must also input the API keys for the relevant APIs (e.g., Perplexity, Bing search, etc.).
12
+
13
+ 4. [X] You have shown evidence of collecting at least two data sources beyond those provided in our course.
14
+
15
+ Your data collection and curation process leverages structured JSON outputs, which are used for advanced RAG functionalities in your app.
16
+
17
+ Your data collection and curation process leverages images and/or PDFs. The parsed data should be useful for some use cases in your app.
18
+
19
+ The app can generate images on the fly. They must be pertinent to the scope of your app.
20
+
21
+ Use a reranker in your RAG pipeline. It can be a fine-tuned version (your choice).
22
+
23
+ Use hybrid search in your RAG pipeline.
24
+
25
+ 5. [ ] Use metadata filtering.
26
+
27
+ Use a fine-tuned LLM in your app.
28
+
29
+ Use a fine-tuned embedding model in your app.
30
+
31
+ Your RAG pipeline includes query routing.
32
+ https://academy.towardsai.net/courses/take/beginner-to-advanced-llm-dev/multimedia/59791736-adding-question-validation-and-routing
33
+
34
+ Your query pipeline includes function calling.
35
+
36
+ Your app manages speech inputs.
37
+
38
+ Your app manages speech outputs.
39
+
40
+ Your app appropriately leverages context caching (also known as prompt caching). Explain in the README how you designed your prompts to be adequate for context caching.
41
+ https://academy.towardsai.net/courses/take/beginner-to-advanced-llm-dev/multimedia/59791739-long-context-llms-context-caching-vs-rag
rag_eval_dataset.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ beautifulsoup4==4.13.3
2
+ chromadb==0.6.3
3
+ gradio==5.22.0
4
+ huggingface_hub==0.29.3
5
+ llama_index==0.12.25
6
+ nest_asyncio==1.6.0
7
+ openai==1.68.0
8
+ pandas==2.2.3
9
+ python-dotenv==1.0.1
10
+ Requests==2.32.3
11
+ tiktoken==0.9.0
12
+ tqdm==4.67.1
test_query.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Smoke-test script: load (or build) the Chroma-backed index and run one query."""
import os
from dotenv import load_dotenv

from llama_index.core import Settings
from llama_index.embeddings.openai import OpenAIEmbedding

from llama_index.llms.openai import OpenAI
from llama_index.llms.gemini import Gemini

from utils import load_db, create_db
from config import CHROMA_PATH, CHROMA_COLLECTION, FILES, CHUNK_SIZE, CHUNK_OVERLAP

# Load API keys (e.g. OPENAI_API_KEY) from .env before any client is used.
load_dotenv()

Settings.llm = OpenAI(temperature=0, model="gpt-4o-mini")
Settings.embed_model = OpenAIEmbedding(
    model="text-embedding-3-small"
)

# Build the vector DB on first run; otherwise reuse the persisted one.
if not os.path.exists(CHROMA_PATH) or not os.listdir(CHROMA_PATH):
    print("⚠️ ChromaDB not found. Creating DB...")
    index = create_db()
else:
    print("✅ ChromaDB found. Loading DB...", CHROMA_PATH, CHROMA_COLLECTION)
    index = load_db()


# Pick exactly ONE query. The original assigned `q` three times in a row,
# so only the last value ever took effect; the alternatives are kept here
# as comments for easy switching.
# q = "What is LittleJS?"
# q = "How to install?"
q = "Who is the author?"
# q = "How to boil an egg?"
# q = "What params does the EngineObject require?"
# q = "What is vec2?"

query_engine = index.as_query_engine(llm=Settings.llm, similarity_top_k=5)
res = query_engine.query(q)

print(res.response)
utils.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import csv
4
+ import hashlib
5
+ from dotenv import load_dotenv
6
+
7
+ import chromadb
8
+ from llama_index.core import Settings
9
+ from llama_index.core import Document
10
+ from llama_index.core.schema import BaseNode
11
+ from llama_index.core.node_parser import TokenTextSplitter
12
+ from llama_index.embeddings.openai import OpenAIEmbedding
13
+ from llama_index.core.ingestion import IngestionPipeline
14
+ from llama_index.core import VectorStoreIndex
15
+ from llama_index.vector_stores.chroma import ChromaVectorStore
16
+ from llama_index.embeddings.openai import OpenAIEmbedding
17
+ from llama_index.llms.openai import OpenAI
18
+
19
+ from config import CHROMA_PATH, CHROMA_COLLECTION, FILES, CHUNK_SIZE, CHUNK_OVERLAP
20
+
21
# Load API keys (e.g. OPENAI_API_KEY) from .env BEFORE constructing the
# OpenAI clients below. In the original order load_dotenv() ran last, so a
# key that only exists in .env could be missing at client-construction time;
# test_query.py already calls load_dotenv() first — this makes utils.py
# consistent with it.
load_dotenv()

# Module-wide LlamaIndex defaults for generation and embeddings.
Settings.llm = OpenAI(temperature=0, model="gpt-4o-mini")
Settings.embed_model = OpenAIEmbedding(
    model="text-embedding-3-small"
)
27
+
28
def deterministic_id_func(i: int, doc: BaseNode) -> str:
    """Stable node-ID generator for the text splitter.

    Hashes the parent document id together with the chunk index so the same
    (document, chunk) pair always yields the same identifier across runs.
    """
    return hashlib.sha256(f"{doc.id_}{i}".encode("utf-8")).hexdigest()
35
+
36
def create_db(return_nodes=False):
    """Build the Chroma vector store from the CSV data files.

    Each CSV row is expected to be (title, text, url) — TODO confirm against
    fetch_docs.py / fetch_repo.py output. Rows are wrapped in Document
    objects, split into overlapping token chunks, embedded, and persisted to
    the Chroma collection.

    Args:
        return_nodes: when True, return the ingested nodes instead of the
            index (useful for evaluation scripts).

    Returns:
        A VectorStoreIndex over the Chroma store, or the list of ingested
        nodes when return_nodes is True.
    """
    rows = []
    for FILE in FILES:
        # newline="" is the csv-module-recommended way to open CSV files.
        with open(FILE, mode="r", encoding="utf-8", newline="") as file:
            csv_reader = csv.reader(file)
            next(csv_reader, None)  # skip header row
            rows.extend(csv_reader)

    # Convert the chunks to Document objects so LlamaIndex can process them.
    documents = [Document(text=row[1], metadata={"title": row[0], "url": row[2]}) for row in rows]
    # Node ids default to random uuids; fix the document ids so that
    # deterministic_id_func produces repeatable chunk ids across runs.
    for idx, doc in enumerate(documents):
        doc.id_ = f"doc_{idx}"

    # Split the text into CHUNK_SIZE-token segments with CHUNK_OVERLAP
    # tokens of overlap between consecutive segments.
    text_splitter = TokenTextSplitter(
        separator=" ", chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP,
        id_func=deterministic_id_func
    )

    chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
    chroma_collection = chroma_client.get_or_create_collection(CHROMA_COLLECTION)
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

    # Apply the transformations (splitting and embedding) to each chunk and
    # store the result in the Chroma vector store.
    pipeline = IngestionPipeline(
        transformations=[
            text_splitter,
            OpenAIEmbedding(model = 'text-embedding-3-small'),
        ],
        vector_store=vector_store
    )

    # Run the transformation pipeline.
    nodes = pipeline.run(documents=documents, show_progress=True)

    # Reuse the client/collection/vector_store built above; the original
    # opened a SECOND PersistentClient against the same path here, which was
    # pure duplication.
    index = VectorStoreIndex.from_vector_store(vector_store)
    return nodes if return_nodes else index
85
+
86
+
87
def load_db():
    """Open the persisted Chroma collection and wrap it in a query index."""
    client = chromadb.PersistentClient(CHROMA_PATH)
    collection = client.get_collection(CHROMA_COLLECTION)
    store = ChromaVectorStore(chroma_collection=collection)

    return VectorStoreIndex.from_vector_store(
        vector_store=store,
        show_progress=True,
        use_async=True,
        embed_model=Settings.embed_model,
    )
101
+
102
def load_asset(file):
    """Load a text asset (e.g. CSS) from an external file.

    Returns the file contents, or an empty string when the file does not
    exist. (The original implicitly returned None on a missing file, which
    breaks callers that concatenate or pass the result where a str is
    expected; "" is still falsy, so truthiness checks behave the same.)
    """
    if not os.path.exists(file):
        return ""
    with open(file, "r", encoding="utf-8") as f:
        return f.read()
107
+
108
def num_tokens_from_messages(messages, model="gpt-4"):
    """Return the number of tokens used by a list of chat messages.

    Follows the OpenAI cookbook recipe for gpt-3.5/gpt-4 style chat models:
    3 tokens of per-message overhead, 1 extra token when a "name" field is
    present, plus 3 tokens priming the assistant reply.

    Args:
        messages: list of dicts with string values (e.g. role/content pairs).
        model: model name used to pick the tiktoken encoding.
    """
    # Imported locally: the original referenced tiktoken without importing
    # it anywhere in this module, so every call raised NameError.
    import tiktoken

    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Unknown model name: fall back to the cl100k_base encoding.
        encoding = tiktoken.get_encoding("cl100k_base")

    tokens_per_message = 3
    tokens_per_name = 1

    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>

    return num_tokens