Spaces:

Leilaaaah
/

BiteBot

Sleeping

App Files Files Community

Added steps 1-6 of semantic search

by tanyaarora2608 - opened Jul 29, 2025

base: refs/heads/main

←

from: refs/pr/3

Discussion Files changed

+113

-0

tanyaarora2608

Jul 29, 2025

Step 1 - Semantic Search

from sentence_transformers import SentenceTransformer
import torch

Step 2 - Semantic Search

# Open the water_cycle.txt file in read mode with UTF-8 encoding.
# The context manager closes the file as soon as its contents are read.
with open("water_cycle.txt", "r", encoding="utf-8") as file:
    # Read the entire contents of the file and store it in a variable
    water_cycle_text = file.read()

# Print the text that was just loaded
print(water_cycle_text)

Step 3 - Semantic Search

def preprocess_text(text):
    """Split *text* into a list of non-empty, whitespace-trimmed lines.

    Args:
        text: The raw document text.

    Returns:
        A list with one entry per line of *text* that still contains
        characters after stripping leading/trailing whitespace, in the
        original order.

    Side effects:
        Prints the resulting list and its length.
    """
    # Strip extra whitespace from the beginning and the end of the text,
    # then split it on every newline character (\n)
    lines = text.strip().split("\n")

    # Trim each line and keep only those that are not empty afterwards
    stripped_lines = (line.strip() for line in lines)
    cleaned_chunks = [chunk for chunk in stripped_lines if chunk]

    # Print cleaned_chunks and how many of them there are
    print(cleaned_chunks)
    print(len(cleaned_chunks))

    return cleaned_chunks

Step 4 - Semantic Search

# Load the pre-trained embedding model that converts text to vectors.
# It is created once at module level; create_embeddings and get_top_chunks
# both call model.encode on it.

model = SentenceTransformer('all-MiniLM-L6-v2')

def create_embeddings(text_chunks):
    """Encode *text_chunks* with the module-level ``model``.

    Args:
        text_chunks: The list of text chunks to embed.

    Returns:
        The tensor returned by ``model.encode(..., convert_to_tensor=True)``.

    Side effects:
        Prints the embeddings tensor and its shape.
    """
    # Convert each text chunk into a vector embedding and store as a tensor
    chunk_embeddings = model.encode(text_chunks, convert_to_tensor=True)

    # Print the chunk embeddings
    print(chunk_embeddings)

    # Print the shape of chunk_embeddings
    print(chunk_embeddings.shape)

    # Return the chunk_embeddings
    return chunk_embeddings

# Call the preprocess_text function and store the result in a cleaned_chunks variable.
# This must run before create_embeddings, which takes cleaned_chunks as its input.
cleaned_chunks = preprocess_text(water_cycle_text)

# Call the create_embeddings function and store the result in a new chunk_embeddings variable
chunk_embeddings = create_embeddings(cleaned_chunks)

Step 5 - Semantic Search

def get_top_chunks(query, chunk_embeddings, text_chunks, *, top_k=3):
    """Return the text chunks most similar to *query*.

    Args:
        query: The query text to embed with the module-level ``model``.
        chunk_embeddings: 2-D tensor of chunk embeddings, one row per chunk
            (it is normalized along ``dim=1``).
        text_chunks: The chunks the rows of *chunk_embeddings* were built
            from; indexed with the positions of the best-scoring rows.
        top_k: Maximum number of chunks to return. Defaults to 3.

    Returns:
        A list of at most *top_k* entries of *text_chunks*, ordered from
        highest to lowest similarity. Empty if *text_chunks* is empty.

    Side effects:
        Prints the similarity scores and the selected indices.
    """
    # Nothing to rank; also avoids calling torch.topk on an empty tensor.
    if not text_chunks:
        return []

    # Convert the query text into a vector embedding
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Normalize the query and every chunk embedding to unit length so that
    # the dot product below is the cosine similarity
    query_embedding_normalized = query_embedding / query_embedding.norm()
    chunk_embeddings_normalized = chunk_embeddings / chunk_embeddings.norm(
        dim=1, keepdim=True
    )

    # Calculate cosine similarity between query and all chunks using matrix multiplication
    similarities = torch.matmul(chunk_embeddings_normalized, query_embedding_normalized)
    print(similarities)

    # torch.topk raises if k exceeds the number of scores, so clamp it
    # for documents with fewer than top_k chunks
    k = min(top_k, similarities.shape[0])
    top_indices = torch.topk(similarities, k=k).indices
    print(top_indices)

    # Convert the index tensor to plain ints before indexing the Python list
    return [text_chunks[index] for index in top_indices.tolist()]

Step 6 - Semantic Search

# Call the get_top_chunks function with the original query, ranking the
# cleaned chunks by similarity of their embeddings to the query

top_results = get_top_chunks("How does water get into the sky", chunk_embeddings, cleaned_chunks)
# Print the top results
print(top_results)

added step 1-6 from semantic search (commit 37366f32)

Upload images, audio, and videos by dragging in the text input, pasting, or clicking here.

Tap or paste here to upload images

Cannot merge

This branch has merge conflicts in the following files:

app.py

· Sign up or log in to comment