Update abc
Browse files
abc
CHANGED
|
@@ -201,6 +201,100 @@ def process_pdf(pdf_path):
|
|
| 201 |
# Run the pipeline with a sample PDF file
|
| 202 |
process_pdf("C:\\Users\\YourName\\Documents\\sample.pdf") # Replace with your actual PDF file path
|
| 203 |
|
| 204 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
|
| 206 |
|
|
|
|
| 201 |
# Run the pipeline with a sample PDF file
|
| 202 |
process_pdf("C:\\Users\\YourName\\Documents\\sample.pdf") # Replace with your actual PDF file path
|
| 203 |
|
| 204 |
+
###############################################################################
### Step 1: Extract Text from PDF ###
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page in the PDF at *pdf_path*.

    Pages are extracted with PyMuPDF's ``page.get_text()`` and joined with
    newlines. Propagates whatever ``fitz.open`` raises for a missing or
    corrupt file.
    """
    # Context manager guarantees the document handle is released even if
    # get_text() raises — the original left the document open (leak).
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc)
### Step 2: Split Text Using RecursiveCharacterTextSplitter ###
def split_text(text):
    """Split *text* into overlapping chunks (500 chars, 50-char overlap)."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_text(text)
    return chunks
### Step 3: Compute Embeddings for Chunk Retrieval ###
def get_relevant_chunks(retrieved_chunks, relevant_queries, model_name='all-MiniLM-L6-v2', top_k=5, similarity_threshold=0.4):
    """Rank *retrieved_chunks* against *relevant_queries* by cosine similarity.

    Returns up to *top_k* chunk strings whose best-scoring (chunk, query)
    pair meets *similarity_threshold*. A chunk may appear more than once if
    it scores highly against several queries. Logs the similarity matrix and
    the top matches to stdout.

    Raises ValueError if either input is empty after blank entries are
    removed.
    """
    encoder = SentenceTransformer(model_name)

    # Drop blank/whitespace-only entries before encoding.
    retrieved_chunks = [c for c in retrieved_chunks if c.strip()]
    relevant_queries = [q for q in relevant_queries if q.strip()]

    if not retrieved_chunks:
        raise ValueError("retrieved_chunks is empty!")
    if not relevant_queries:
        raise ValueError("relevant_queries is empty!")

    chunk_vecs = encoder.encode(retrieved_chunks, convert_to_tensor=True)
    query_vecs = encoder.encode(relevant_queries, convert_to_tensor=True)

    # Cosine similarity matrix: rows = chunks, columns = queries.
    sim = util.cos_sim(chunk_vecs, query_vecs)
    print("Cosine Similarity Matrix (rows: chunks, columns: queries):\n", sim)

    # Flatten every (chunk, query) pair with its score, best first.
    scored_pairs = [
        (chunk, query, sim[i][j].item())
        for i, chunk in enumerate(retrieved_chunks)
        for j, query in enumerate(relevant_queries)
    ]
    scored_pairs = sorted(scored_pairs, key=lambda pair: pair[2], reverse=True)

    print("\nTop Relevant Chunks and Scores:")
    for chunk, query, score in scored_pairs[:top_k]:
        print(f"\nChunk:\n{chunk[:150]}...\nQuery: {query}\nScore: {score:.4f}")

    # Threshold first, then cap at top_k; return chunk text only.
    kept = [pair for pair in scored_pairs if pair[2] >= similarity_threshold]
    return [pair[0] for pair in kept[:top_k]]
### Step 4: Pass Top-K Chunks to OpenAI LLM ###
def query_openai(prompt):
    """Send *prompt* to the OpenAI chat API and return the reply text.

    NOTE(review): this uses the legacy ``openai.ChatCompletion`` interface,
    which was removed in openai>=1.0 — confirm the pinned package version.
    """
    chat_messages = [
        {"role": "system", "content": "You are an assistant."},
        {"role": "user", "content": prompt},
    ]
    response = openai.ChatCompletion.create(
        model="gpt-4",  # Or "gpt-3.5-turbo"
        messages=chat_messages,
    )
    return response["choices"][0]["message"]["content"]
### Final Workflow ###
def process_pdf(pdf_path):
    """Run the full pipeline on the PDF at *pdf_path*.

    Extracts text, splits it into chunks, retrieves the chunks most relevant
    to a fixed set of queries, asks OpenAI for a summary, and prints the
    response. Returns None; output goes to stdout.
    """
    # Step 1: Extract text
    extracted_text = extract_text_from_pdf(pdf_path)

    # Step 2: Split into chunks
    retrieved_chunks = split_text(extracted_text)

    # Step 3: Define queries
    relevant_queries = ["Eiffel Tower", "Paris landmarks", "French history"]

    # Step 4: Retrieve top-K relevant chunks
    top_chunks = get_relevant_chunks(retrieved_chunks, relevant_queries)

    # Step 5: Query OpenAI.
    # The join is hoisted out of the f-string: a backslash inside an
    # f-string expression ('\n\n'.join(...)) is a SyntaxError on
    # Python < 3.12 (fixed by PEP 701 in 3.12).
    joined_chunks = "\n\n".join(top_chunks)
    prompt = (
        f"You are a historical assistant. Summarize the following content "
        f"in context of the queries: {', '.join(relevant_queries)}.\n\n"
        f"Relevant content:\n{joined_chunks}"
    )
    openai_response = query_openai(prompt)

    print("\nOpenAI Response:\n", openai_response)
### Run the pipeline on your sample PDF ###
if __name__ == "__main__":
    # Raw string keeps the Windows path backslashes literal.
    sample_pdf = r"C:\Users\shalini\Desktop\Project\abc.pdf"
    process_pdf(sample_pdf)