iaravagni committed on
Commit
6c1417f
·
1 Parent(s): 3fc053b

chunk size modification

Browse files
Files changed (2) hide show
  1. app.py +25 -9
  2. embeddings.csv +0 -0
app.py CHANGED
@@ -23,11 +23,22 @@ def clean_text(text):
23
  text = text.replace(r"\'", "'")
24
  return text
25
 
26
- def chunk_text(text):
27
- clean = clean_text(text)
28
- paragraphs = re.split(r'\n', clean)
29
- paragraphs = [p.strip() for p in paragraphs if p.strip()]
30
- return paragraphs
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  def generate_embeddings(chunks, model_name="all-MiniLM-L6-v2"):
33
  model = SentenceTransformer(model_name)
@@ -35,7 +46,7 @@ def generate_embeddings(chunks, model_name="all-MiniLM-L6-v2"):
35
  return embeddings
36
 
37
  def store_in_database(chunks, embeddings):
38
- with open("embeddings.csv", "w", newline="") as f:
39
  writer = csv.writer(f)
40
  writer.writerow(["text", "embedding"])
41
  for chunk, embedding in zip(chunks, embeddings):
@@ -63,7 +74,7 @@ def load_from_database(filepath):
63
  embeddings.append(embedding)
64
  return chunks, np.array(embeddings)
65
 
66
- def semantic_search(queryEmbedding, topK=3):
67
  dbChunks, dbEmbeddings = load_from_database("embeddings.csv")
68
  similarities = [cosine_similarity(dbEmbedding, queryEmbedding) for dbEmbedding in dbEmbeddings]
69
  topIndex = np.argsort(similarities)[-topK:][::-1]
@@ -72,7 +83,12 @@ def semantic_search(queryEmbedding, topK=3):
72
 
73
  def insert_in_LMM_prompt(retrievedContext, query, model_name="gemini-1.5-flash-001"):
74
  prompt = f"""
75
- You are an AI assistant answering a user's query based on retrieved knowledge.
 
 
 
 
 
76
 
77
  Context:
78
  {retrievedContext}
@@ -108,7 +124,7 @@ iface = gr.Interface(
108
  ],
109
  outputs="text",
110
  live=False, # Disable live updates
111
- title="RAG App system", # Title of the app
112
  description="Upload a PDF and ask a question to extract information from it.", # Optional description
113
  allow_flagging="never",
114
  )
 
23
  text = text.replace(r"\'", "'")
24
  return text
25
 
26
+
27
+ def chunk_text(text, chunk_size=500, overlap=100):
28
+
29
+ clean = clean_text(text) # Ensure text is preprocessed
30
+ words = clean.split() # Split by words to avoid breaking mid-word
31
+
32
+ chunks = []
33
+ start = 0 # Start index for chunking
34
+
35
+ while start < len(words):
36
+ end = start + chunk_size # Define chunk endpoint
37
+ chunk = " ".join(words[start:end]) # Get words within the chunk
38
+ chunks.append(chunk.strip()) # Strip extra spaces
39
+ start += chunk_size - overlap # Move start forward with overlap
40
+
41
+ return chunks
42
 
43
  def generate_embeddings(chunks, model_name="all-MiniLM-L6-v2"):
44
  model = SentenceTransformer(model_name)
 
46
  return embeddings
47
 
48
  def store_in_database(chunks, embeddings):
49
+ with open("embeddings.csv", "w", newline="", encoding="utf-8") as f:
50
  writer = csv.writer(f)
51
  writer.writerow(["text", "embedding"])
52
  for chunk, embedding in zip(chunks, embeddings):
 
74
  embeddings.append(embedding)
75
  return chunks, np.array(embeddings)
76
 
77
+ def semantic_search(queryEmbedding, topK=5):
78
  dbChunks, dbEmbeddings = load_from_database("embeddings.csv")
79
  similarities = [cosine_similarity(dbEmbedding, queryEmbedding) for dbEmbedding in dbEmbeddings]
80
  topIndex = np.argsort(similarities)[-topK:][::-1]
 
83
 
84
  def insert_in_LMM_prompt(retrievedContext, query, model_name="gemini-1.5-flash-001"):
85
  prompt = f"""
86
+ You are a helpful and responsible AI assistant providing professional guidance for healthcare staff.
87
+
88
+ The user has provided a knowledge base with relevant medical training materials.
89
+
90
+ Use only the retrieved context below to answer the question factually and safely.
91
+
92
 
93
  Context:
94
  {retrievedContext}
 
124
  ],
125
  outputs="text",
126
  live=False, # Disable live updates
127
+ title="RAG System Web App", # Title of the app
128
  description="Upload a PDF and ask a question to extract information from it.", # Optional description
129
  allow_flagging="never",
130
  )
embeddings.csv ADDED
The diff for this file is too large to render. See raw diff