# STEP 1 from Semantic Search (import libraries)
from huggingface_hub import InferenceClient
from sentence_transformers import SentenceTransformer
import torch
import gradio as gr

client = InferenceClient("openchat/openchat-3.5-0106")
# STEP 2 from Semantic Search (read file)
# Open the physics_info.txt file in read mode with UTF-8 encoding
with open("physics_info.txt", "r", encoding="utf-8") as file:
    # Read the entire contents of the file and store it in a variable
    physics_info_text = file.read()

# Print the text that was read in
print(physics_info_text)
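# Hedged fallback sketch (not part of the original app): if physics_info.txt
# is missing from the Space, you could substitute a tiny placeholder string so
# the rest of the pipeline still runs; the sample text below is made up.
# physics_info_text = "Gravity pulls objects toward each other. Light travels in waves."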
# STEP 3 from Semantic Search (chunk data)
def preprocess_text(text):
    # Strip extra whitespace from the beginning and the end of the text
    cleaned_text = text.strip()
    # Split the cleaned_text at every period (".") so each sentence becomes a chunk
    chunks = cleaned_text.split(".")
    # Create an empty list to store cleaned chunks
    cleaned_chunks = []
    # Clean each chunk and add it to the cleaned_chunks list, skipping empty strings
    for chunk in chunks:
        stripped_chunk = chunk.strip()
        if len(stripped_chunk) > 0:
            cleaned_chunks.append(stripped_chunk)
    # Print cleaned_chunks
    print(cleaned_chunks)
    # Print the length of cleaned_chunks
    print(len(cleaned_chunks))
    # Return the cleaned_chunks
    return cleaned_chunks
# Call the preprocess_text function and store the result in a cleaned_chunks variable
cleaned_chunks = preprocess_text(physics_info_text)
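# Hedged sketch of what preprocess_text produces; the sentences below are made
# up for illustration and are not from physics_info.txt.
# preprocess_text("Gravity pulls objects together. Light travels in waves.")
# -> ['Gravity pulls objects together', 'Light travels in waves']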
# Load the pre-trained embedding model that converts text to vectors
model = SentenceTransformer('all-MiniLM-L6-v2')
# STEP 4 from Semantic Search (embed chunks)
def create_embeddings(text_chunks):
    # Convert each text chunk into a vector embedding and store as a tensor
    chunk_embeddings = model.encode(text_chunks, convert_to_tensor=True)
    # Print the chunk embeddings
    print(chunk_embeddings)
    # Print the shape of chunk_embeddings (.shape is a property, not a method, so no parentheses)
    print(chunk_embeddings.shape)
    # Return the chunk_embeddings
    return chunk_embeddings
# Call the create_embeddings function and store the result in a new chunk_embeddings variable
chunk_embeddings = create_embeddings(cleaned_chunks)
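# Optional sanity check (not in the original app): all-MiniLM-L6-v2 produces
# 384-dimensional vectors, so the tensor should be (number of chunks, 384).
assert chunk_embeddings.shape == (len(cleaned_chunks), 384)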
# STEP 5 from Semantic Search (find and print top chunks)
# Define a function that finds the most relevant text chunks for a given query
def get_top_chunks(query, chunk_embeddings, text_chunks):
    # Convert the query text into a vector embedding
    query_embedding = model.encode(query, convert_to_tensor=True)
    # Normalize the query embedding to unit length for accurate similarity comparison
    query_embedding_normalized = query_embedding / query_embedding.norm()
    # Normalize all chunk embeddings to unit length for consistent comparison
    chunk_embeddings_normalized = chunk_embeddings / chunk_embeddings.norm(dim=1, keepdim=True)
    # Calculate cosine similarity between the query and all chunks using matrix multiplication
    similarities = torch.matmul(chunk_embeddings_normalized, query_embedding_normalized)
    # Print the similarities
    print(similarities)
    # Find the indices of the 3 chunks with the highest similarity scores
    top_indices = torch.topk(similarities, k=3).indices
    # Print the top indices
    print(top_indices)
    # Create an empty list to store the most relevant chunks
    top_chunks = []
    # Loop through the top indices and retrieve the corresponding text chunks
    for index in top_indices:
        chunk = text_chunks[index]
        top_chunks.append(chunk)
    # Return the list of most relevant chunks
    return top_chunks
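# Hedged usage sketch; the question is hypothetical and the result depends on
# the contents of physics_info.txt.
# top = get_top_chunks("What is Newton's second law?", chunk_embeddings, cleaned_chunks)
# print(top)  # the three sentences most similar to the question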
def respond(message, history, name, level):
    # Retrieve the three chunks most relevant to the user's question
    best_physics_chunks = get_top_chunks(message, chunk_embeddings, cleaned_chunks)
    print(best_physics_chunks)
    str_physics_chunks = "\n".join(best_physics_chunks)
    messages = [
        {
            "role": "system",
            "content": (
                "You are a very smart, arrogant professor who knows a lot about physics. "
                f"You answer the questions from the user, whose name is {name}, directly and concisely, as if they were a {level}. "
                "Base your response on the provided context. "
                f"Make sure to use the user's name, {name}, in every response. "
                f"Speak to the user as though they are a {level} and use appropriate language for them. "
                "Keep your answers below 100 words! "
                "Always finish your response at the end of a sentence."
            )
        }
    ]
    # Include earlier turns so the model sees the conversation so far
    if history:
        messages.extend(history)
    # Add the current question once, together with its retrieved context
    messages.append({
        "role": "user",
        "content": (
            f"Context:\n{str_physics_chunks}\n\n"
            f"Question: {message}"
        )
    })
    response = client.chat_completion(messages, max_tokens=120)
    print(response)
    return response['choices'][0]['message']['content'].strip()
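# Hedged smoke test (not part of the original app): respond can be called
# outside Gradio; the name and question here are hypothetical, and history
# uses the "messages" format (a list of {"role": ..., "content": ...} dicts).
# print(respond("Why is the sky blue?", [], "Ada", "child"))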
about_text = """
## Use this chatbot to help you with Physics
"""
title = """
# 🧬 Professor PhysicsBot 🧬
"""
with gr.Blocks(theme='mgetz/Celeb_glitzy') as PhysicsBot:
    with gr.Row(scale=1):
        gr.Image("Professor PhysicsBot.png", show_label=False, show_share_button=False, show_download_button=False)
    with gr.Row(scale=5):
        with gr.Column(scale=1):
            gr.Markdown(title)
            gr.Image("CruelRobot.jpg", show_label=False, show_share_button=False, show_download_button=False, width=300, height=300)
            gr.Markdown(about_text)
        with gr.Column(scale=3):
            user_name = gr.Textbox(placeholder="Type your name here", label="Name")
            # Radio returns a single string; a CheckboxGroup would pass a list like ['child'] into the prompt
            difficulty_level = gr.Radio(["baby", "child", "high school student", "Physics Genius"], label="Choose your Physics Level")
            gr.ChatInterface(
                fn=respond,
                additional_inputs=[user_name, difficulty_level],
                type="messages")

PhysicsBot.launch()