MissSqui committed on
Commit
84f3ab4
·
verified ·
1 Parent(s): c90b64e

Update abc

Browse files
Files changed (1) hide show
  1. abc +95 -1
abc CHANGED
@@ -201,6 +201,100 @@ def process_pdf(pdf_path):
201
  # Run the pipeline with a sample PDF file
202
  process_pdf("C:\\Users\\YourName\\Documents\\sample.pdf") # Replace with your actual PDF file path
203
 
204
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
 
206
 
 
201
  # Run the pipeline with a sample PDF file
202
  process_pdf("C:\\Users\\YourName\\Documents\\sample.pdf") # Replace with your actual PDF file path
203
 
204
+ ###############################################################################
205
+ ### Step 1: Extract Text from PDF ###
206
def extract_text_from_pdf(pdf_path):
    """Open the PDF at *pdf_path* and return the text of all pages as one string.

    Pages are concatenated in document order, separated by single newlines.
    """
    document = fitz.open(pdf_path)
    pages = [page.get_text() for page in document]
    return "\n".join(pages)
210
+
211
### Step 2: Split Text Using RecursiveCharacterTextSplitter ###
def split_text(text):
    """Break *text* into overlapping chunks (500 chars each, 50-char overlap)."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_text(text)
    return chunks
215
+
216
### Step 3: Compute Embeddings for Chunk Retrieval ###
def get_relevant_chunks(retrieved_chunks, relevant_queries, model_name='all-MiniLM-L6-v2', top_k=5, similarity_threshold=0.4):
    """Rank text chunks against queries by cosine similarity of sentence embeddings.

    Returns the chunk texts of the top *top_k* (chunk, query) pairs whose score
    meets *similarity_threshold*. Note a chunk may appear more than once if it
    scores highly against several queries.

    Raises:
        ValueError: if either input list is empty after dropping blank strings.
    """
    model = SentenceTransformer(model_name)

    # Drop whitespace-only entries before embedding.
    retrieved_chunks = [c for c in retrieved_chunks if c.strip()]
    relevant_queries = [q for q in relevant_queries if q.strip()]

    # Fail loudly rather than embedding nothing.
    if not retrieved_chunks:
        raise ValueError("retrieved_chunks is empty!")
    if not relevant_queries:
        raise ValueError("relevant_queries is empty!")

    # Embed both sides and build the (chunks x queries) cosine-similarity matrix.
    chunk_embeddings = model.encode(retrieved_chunks, convert_to_tensor=True)
    query_embeddings = model.encode(relevant_queries, convert_to_tensor=True)
    cosine_sim_matrix = util.cos_sim(chunk_embeddings, query_embeddings)

    # Log similarity matrix for debugging.
    print("Cosine Similarity Matrix (rows: chunks, columns: queries):\n", cosine_sim_matrix)

    # Flatten every (chunk, query, score) pairing, then order best-first.
    relevant_scores = [
        (chunk, query, cosine_sim_matrix[i][j].item())
        for i, chunk in enumerate(retrieved_chunks)
        for j, query in enumerate(relevant_queries)
    ]
    relevant_scores.sort(key=lambda triple: triple[2], reverse=True)

    # Log top matches.
    print("\nTop Relevant Chunks and Scores:")
    for r, q, s in relevant_scores[:top_k]:
        print(f"\nChunk:\n{r[:150]}...\nQuery: {q}\nScore: {s:.4f}")

    # Keep only pairs above the threshold, then cap at top_k.
    passing = [triple for triple in relevant_scores if triple[2] >= similarity_threshold]
    return [triple[0] for triple in passing[:top_k]]
260
+
261
### Step 4: Pass Top-K Chunks to OpenAI LLM ###
def query_openai(prompt):
    """Send *prompt* to the OpenAI chat API and return the assistant's reply text.

    NOTE(review): uses the legacy `openai.ChatCompletion` interface (pre-1.0
    openai package) — confirm the installed client version supports it.
    """
    messages = [
        {"role": "system", "content": "You are an assistant."},
        {"role": "user", "content": prompt},
    ]
    response = openai.ChatCompletion.create(
        model="gpt-4",  # Or "gpt-3.5-turbo"
        messages=messages,
    )
    return response["choices"][0]["message"]["content"]
271
+
272
### Final Workflow ###
def process_pdf(pdf_path):
    """Run the full pipeline: extract PDF text, chunk it, retrieve the chunks
    most relevant to a fixed set of queries, and summarize them via OpenAI.

    Parameters:
        pdf_path: filesystem path to the PDF to process.

    Side effects: prints the OpenAI response; network/API calls via
    get_relevant_chunks (model download) and query_openai.
    """
    # Step 1: Extract text
    extracted_text = extract_text_from_pdf(pdf_path)

    # Step 2: Split into chunks
    retrieved_chunks = split_text(extracted_text)

    # Step 3: Define queries
    relevant_queries = ["Eiffel Tower", "Paris landmarks", "French history"]

    # Step 4: Retrieve top-K relevant chunks
    top_chunks = get_relevant_chunks(retrieved_chunks, relevant_queries)

    # Step 5: Query OpenAI.
    # BUGFIX: the join is hoisted out of the f-string — a backslash escape
    # ('\n\n') inside an f-string replacement field is a SyntaxError on
    # Python < 3.12 (only allowed since PEP 701).
    joined_chunks = "\n\n".join(top_chunks)
    prompt = (
        f"You are a historical assistant. Summarize the following content "
        f"in context of the queries: {', '.join(relevant_queries)}.\n\n"
        f"Relevant content:\n{joined_chunks}"
    )
    openai_response = query_openai(prompt)

    print("\nOpenAI Response:\n", openai_response)
295
+
296
### Run the pipeline on your sample PDF ###
if __name__ == "__main__":
    # Entry point: hard-coded sample PDF path (raw string keeps backslashes literal).
    process_pdf(r"C:\Users\shalini\Desktop\Project\abc.pdf")
299
 
300