iaravagni committed on
Commit
6c1417f
·
1 Parent(s): 3fc053b

chunk size modification

Browse files
Files changed (2) hide show
  1. app.py +25 -9
  2. embeddings.csv +0 -0
app.py CHANGED
@@ -23,11 +23,22 @@ def clean_text(text):
23
  text = text.replace(r"\'", "'")
24
  return text
25
 
26
- def chunk_text(text):
27
- clean = clean_text(text)
28
- paragraphs = re.split(r'\n', clean)
29
- paragraphs = [p.strip() for p in paragraphs if p.strip()]
30
- return paragraphs
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  def generate_embeddings(chunks, model_name="all-MiniLM-L6-v2"):
33
  model = SentenceTransformer(model_name)
@@ -35,7 +46,7 @@ def generate_embeddings(chunks, model_name="all-MiniLM-L6-v2"):
35
  return embeddings
36
 
37
  def store_in_database(chunks, embeddings):
38
- with open("embeddings.csv", "w", newline="") as f:
39
  writer = csv.writer(f)
40
  writer.writerow(["text", "embedding"])
41
  for chunk, embedding in zip(chunks, embeddings):
@@ -63,7 +74,7 @@ def load_from_database(filepath):
63
  embeddings.append(embedding)
64
  return chunks, np.array(embeddings)
65
 
66
- def semantic_search(queryEmbedding, topK=3):
67
  dbChunks, dbEmbeddings = load_from_database("embeddings.csv")
68
  similarities = [cosine_similarity(dbEmbedding, queryEmbedding) for dbEmbedding in dbEmbeddings]
69
  topIndex = np.argsort(similarities)[-topK:][::-1]
@@ -72,7 +83,12 @@ def semantic_search(queryEmbedding, topK=3):
72
 
73
  def insert_in_LMM_prompt(retrievedContext, query, model_name="gemini-1.5-flash-001"):
74
  prompt = f"""
75
- You are an AI assistant answering a user's query based on retrieved knowledge.
 
 
 
 
 
76
 
77
  Context:
78
  {retrievedContext}
@@ -108,7 +124,7 @@ iface = gr.Interface(
108
  ],
109
  outputs="text",
110
  live=False, # Disable live updates
111
- title="RAG App system", # Title of the app
112
  description="Upload a PDF and ask a question to extract information from it.", # Optional description
113
  allow_flagging="never",
114
  )
 
23
  text = text.replace(r"\'", "'")
24
  return text
25
 
26
+
27
+ def chunk_text(text, chunk_size=500, overlap=100):
28
+
29
+ clean = clean_text(text) # Ensure text is preprocessed
30
+ words = clean.split() # Split by words to avoid breaking mid-word
31
+
32
+ chunks = []
33
+ start = 0 # Start index for chunking
34
+
35
+ while start < len(words):
36
+ end = start + chunk_size # Define chunk endpoint
37
+ chunk = " ".join(words[start:end]) # Get words within the chunk
38
+ chunks.append(chunk.strip()) # Strip extra spaces
39
+ start += chunk_size - overlap # Move start forward with overlap
40
+
41
+ return chunks
42
 
43
  def generate_embeddings(chunks, model_name="all-MiniLM-L6-v2"):
44
  model = SentenceTransformer(model_name)
 
46
  return embeddings
47
 
48
  def store_in_database(chunks, embeddings):
49
+ with open("embeddings.csv", "w", newline="", encoding="utf-8") as f:
50
  writer = csv.writer(f)
51
  writer.writerow(["text", "embedding"])
52
  for chunk, embedding in zip(chunks, embeddings):
 
74
  embeddings.append(embedding)
75
  return chunks, np.array(embeddings)
76
 
77
+ def semantic_search(queryEmbedding, topK=5):
78
  dbChunks, dbEmbeddings = load_from_database("embeddings.csv")
79
  similarities = [cosine_similarity(dbEmbedding, queryEmbedding) for dbEmbedding in dbEmbeddings]
80
  topIndex = np.argsort(similarities)[-topK:][::-1]
 
83
 
84
  def insert_in_LMM_prompt(retrievedContext, query, model_name="gemini-1.5-flash-001"):
85
  prompt = f"""
86
+ You are a helpful and responsible AI assistant providing professional guidance for healthcare staff.
87
+
88
+ The user has provided a knowledge base with relevant medical training materials.
89
+
90
+ Use only the retrieved context below to answer the question factually and safely.
91
+
92
 
93
  Context:
94
  {retrievedContext}
 
124
  ],
125
  outputs="text",
126
  live=False, # Disable live updates
127
+ title="RAG System Web App", # Title of the app
128
  description="Upload a PDF and ask a question to extract information from it.", # Optional description
129
  allow_flagging="never",
130
  )
embeddings.csv ADDED
The diff for this file is too large to render. See raw diff