from sentence_transformers import SentenceTransformer, util
print("import done")
# Input chunks
retrieved_chunks = [
    "The Eiffel Tower is a landmark in Paris.",
    "Paris is the capital of France.",
    "The Louvre is also in Paris.",
    "Eiffel Tower was built in 1889.",
    "It is a famous tourist spot."
]
relevant_chunks = [
    "The Eiffel Tower is a landmark in Paris.",
    "Eiffel Tower was built in 1889."
]
# Load sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')
# Compute embeddings
retrieved_embeddings = model.encode(retrieved_chunks, convert_to_tensor=True)
relevant_embeddings = model.encode(relevant_chunks, convert_to_tensor=True)
# Calculate pairwise cosine similarities
cosine_sim_matrix = util.cos_sim(retrieved_embeddings, relevant_embeddings)
# Print similarity matrix
print("Cosine Similarity Matrix (rows: retrieved, columns: relevant):\n")
for i, retrieved in enumerate(retrieved_chunks):
    for j, relevant in enumerate(relevant_chunks):
        score = cosine_sim_matrix[i][j].item()
        print(f"Similarity between:\n Retrieved: \"{retrieved}\"\n Relevant : \"{relevant}\"\n Score : {score:.4f}\n")
-----------------------------------------
import numpy as np
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
#client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
# Load models
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
perplexity_model = GPT2LMHeadModel.from_pretrained("gpt2")
perplexity_model.eval()
# Evaluation Metrics
def bleu_rouge_score(reference, generated):
    bleu = sentence_bleu([reference.split()], generated.split())
    rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rougeL = rouge.score(reference, generated)['rougeL'].fmeasure
    return {'bleu': bleu, 'rougeL': rougeL}

def cosine_sim(reference, generated):
    emb_ref = embedding_model.encode([reference])[0]
    emb_gen = embedding_model.encode([generated])[0]
    sim = cosine_similarity([emb_ref], [emb_gen])[0][0]
    return sim

def perplexity_score(text):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = perplexity_model(**inputs, labels=inputs["input_ids"])
    loss = outputs.loss
    return torch.exp(loss).item()

def precision_at_k(retrieved, relevant, k):
    top_k = retrieved[:k]
    correct = sum(1 for item in top_k if item in relevant)
    return correct / k

def recall_at_k(retrieved, relevant, k):
    correct = sum(1 for item in retrieved[:k] if item in relevant)
    return correct / len(relevant)

def ndcg_at_k(retrieved, relevant, k):
    def dcg(items):
        return sum(1 / np.log2(i + 2) if items[i] in relevant else 0 for i in range(len(items)))
    ideal = dcg(relevant[:k])
    actual = dcg(retrieved[:k])
    return actual / ideal if ideal != 0 else 0

def hit_at_k(retrieved, relevant, k):
    top_k = retrieved[:k]
    return int(any(item in relevant for item in top_k))
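# A quick sanity check of the ranking metrics on toy lists (a sketch added for
# illustration; the values follow directly from the definitions above):
# precision@5 = 2 hits / 5 retrieved, recall@5 = 2 hits / 2 relevant.
_retrieved = ["a", "b", "c", "d", "e"]
_relevant = ["a", "d"]
assert precision_at_k(_retrieved, _relevant, 5) == 0.4
assert recall_at_k(_retrieved, _relevant, 5) == 1.0
assert hit_at_k(_retrieved, _relevant, 5) == 1
print("ndcg@5 on toy lists:", ndcg_at_k(_retrieved, _relevant, 5))  # ~0.877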
# Main Evaluation
def full_evaluation(reference, generated, retrieved, relevant_chunks):
    return {
        **bleu_rouge_score(reference, generated),
        "cosine_similarity": cosine_sim(reference, generated),
        "perplexity": perplexity_score(generated),
        "precision@5": precision_at_k(retrieved, relevant_chunks, 5),
        "recall@5": recall_at_k(retrieved, relevant_chunks, 5),
        "ndcg@5": ndcg_at_k(retrieved, relevant_chunks, 5),
        "hit@5": hit_at_k(retrieved, relevant_chunks, 5)
    }
# Sample Run
if __name__ == "__main__":
    reference_answer = "The Eiffel Tower is located in Paris."
    generated_response = "Eiffel Tower stands in Paris."
    retrieved_chunks = [
        "The Eiffel Tower is a landmark in Paris.",
        "Paris is the capital of France.",
        "The Louvre is also in Paris.",
        "Eiffel Tower was built in 1889.",
        "It is a famous tourist spot."
    ]
    relevant_chunks = [
        "The Eiffel Tower is a landmark in Paris.",
        "Eiffel Tower was built in 1889."
    ]
    scores = full_evaluation(reference_answer, generated_response, retrieved_chunks, relevant_chunks)
    for metric, score in scores.items():
        print(f"{metric}: {score:.4f}" if isinstance(score, float) else f"{metric}: {score}")
---------------------------
# Install dependencies first (if not already installed)
# pip install openai sentence-transformers langchain pymupdf
import fitz # PyMuPDF for PDF text extraction
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util
import openai
# Configure OpenAI API Key
openai.api_key = "YOUR_OPENAI_API_KEY" # Replace with your actual OpenAI API key
### **Step 1: Extract Text from PDF**
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text() for page in doc])
    return text

### **Step 2: Split Text Using RecursiveCharacterTextSplitter**
def split_text(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    return text_splitter.split_text(text)
### **Step 3: Compute Embeddings for Chunk Retrieval**
def get_relevant_chunks(retrieved_chunks, relevant_queries, model_name='all-MiniLM-L6-v2'):
    model = SentenceTransformer(model_name)
    retrieved_embeddings = model.encode(retrieved_chunks, convert_to_tensor=True)
    relevant_embeddings = model.encode(relevant_queries, convert_to_tensor=True)
    cosine_sim_matrix = util.cos_sim(retrieved_embeddings, relevant_embeddings)
    print("Cosine Similarity Matrix (rows: retrieved, columns: relevant queries):\n")
    relevant_scores = []
    for i, retrieved in enumerate(retrieved_chunks):
        for j, relevant in enumerate(relevant_queries):
            score = cosine_sim_matrix[i][j].item()
            relevant_scores.append((retrieved, relevant, score))
            print(f"Similarity between:\n Retrieved: \"{retrieved}\"\n Query : \"{relevant}\"\n Score : {score:.4f}\n")
    # Sort and return the top-K most relevant chunks
    relevant_scores.sort(key=lambda x: x[2], reverse=True)
    return [x[0] for x in relevant_scores[:5]]  # Adjust top-K as needed
### **Step 4: Pass Top-K Chunks to OpenAI LLM**
def query_openai(prompt):
    response = openai.ChatCompletion.create(
        model="gpt-4",  # You can change this to "gpt-3.5-turbo" or another model
        messages=[{"role": "system", "content": "You are an assistant."},
                  {"role": "user", "content": prompt}]
    )
    return response["choices"][0]["message"]["content"]
### **Final Workflow**
def process_pdf(pdf_path):
    # Step 1: Extract text from PDF
    extracted_text = extract_text_from_pdf(pdf_path)
    # Step 2: Split text into chunks
    retrieved_chunks = split_text(extracted_text)
    # Step 3: Define relevant queries (modify as per your needs)
    relevant_queries = ["Eiffel Tower", "Paris landmarks", "French history"]
    # Step 4: Retrieve top-K relevant chunks
    top_chunks = get_relevant_chunks(retrieved_chunks, relevant_queries)
    # Step 5: Query OpenAI LLM with relevant chunks
    prompt = f"Summarize this information: {' '.join(top_chunks)}"
    openai_response = query_openai(prompt)
    print("\nOpenAI Response:\n", openai_response)
# Run the pipeline with a sample PDF file
process_pdf("C:\\Users\\YourName\\Documents\\sample.pdf") # Replace with your actual PDF file path
###############################################################################
### Step 1: Extract Text from PDF ###
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text() for page in doc])
    return text

### Step 2: Split Text Using RecursiveCharacterTextSplitter ###
def split_text(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    return text_splitter.split_text(text)
### Step 3: Compute Embeddings for Chunk Retrieval ###
def get_relevant_chunks(retrieved_chunks, relevant_queries, model_name='all-MiniLM-L6-v2', top_k=5, similarity_threshold=0.4):
    model = SentenceTransformer(model_name)
    # Clean up empty strings
    retrieved_chunks = [chunk for chunk in retrieved_chunks if chunk.strip()]
    relevant_queries = [q for q in relevant_queries if q.strip()]
    # Debug check
    if not retrieved_chunks:
        raise ValueError("retrieved_chunks is empty!")
    if not relevant_queries:
        raise ValueError("relevant_queries is empty!")
    # Compute embeddings
    retrieved_embeddings = model.encode(retrieved_chunks, convert_to_tensor=True)
    relevant_embeddings = model.encode(relevant_queries, convert_to_tensor=True)
    # Cosine similarity matrix: (retrieved x queries)
    cosine_sim_matrix = util.cos_sim(retrieved_embeddings, relevant_embeddings)
    # Log similarity matrix
    print("Cosine Similarity Matrix (rows: chunks, columns: queries):\n", cosine_sim_matrix)
    # Score all pairs
    relevant_scores = []
    for i, retrieved in enumerate(retrieved_chunks):
        for j, relevant in enumerate(relevant_queries):
            score = cosine_sim_matrix[i][j].item()
            relevant_scores.append((retrieved, relevant, score))
    # Sort by score descending
    relevant_scores.sort(key=lambda x: x[2], reverse=True)
    # Log top matches
    print("\nTop Relevant Chunks and Scores:")
    for r, q, s in relevant_scores[:top_k]:
        print(f"\nChunk:\n{r[:150]}...\nQuery: {q}\nScore: {s:.4f}")
    # Apply threshold
    filtered = [x for x in relevant_scores if x[2] >= similarity_threshold]
    top_filtered = filtered[:top_k]
    return [x[0] for x in top_filtered]
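# get_relevant_chunks reloads the SentenceTransformer from disk on every call.
# A small sketch (using only functools from the standard library) caches one
# instance per model name so repeated calls reuse the loaded weights:
from functools import lru_cache

@lru_cache(maxsize=2)
def _get_cached_model(model_name='all-MiniLM-L6-v2'):
    return SentenceTransformer(model_name)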
### Step 4: Pass Top-K Chunks to OpenAI LLM ###
def query_openai(prompt):
    response = openai.ChatCompletion.create(
        model="gpt-4",  # Or "gpt-3.5-turbo"
        messages=[
            {"role": "system", "content": "You are an assistant."},
            {"role": "user", "content": prompt}
        ]
    )
    return response["choices"][0]["message"]["content"]
### Final Workflow ###
def process_pdf(pdf_path):
    # Step 1: Extract text
    extracted_text = extract_text_from_pdf(pdf_path)
    # Step 2: Split into chunks
    retrieved_chunks = split_text(extracted_text)
    # Step 3: Define queries
    relevant_queries = ["Eiffel Tower", "Paris landmarks", "French history"]
    # Step 4: Retrieve top-K relevant chunks
    top_chunks = get_relevant_chunks(retrieved_chunks, relevant_queries)
    # Step 5: Query OpenAI
    # Join outside the f-string: backslashes inside f-string expressions are a
    # SyntaxError before Python 3.12.
    joined_chunks = "\n\n".join(top_chunks)
    prompt = (
        f"You are a historical assistant. Summarize the following content "
        f"in context of the queries: {', '.join(relevant_queries)}.\n\n"
        f"Relevant content:\n{joined_chunks}"
    )
    openai_response = query_openai(prompt)
    print("\nOpenAI Response:\n", openai_response)
### Run the pipeline on your sample PDF ###
if __name__ == "__main__":
    process_pdf(r"C:\Users\shalini\Desktop\Project\abc.pdf")
=======================================================================
import fitz
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util
import openai
# For Windows users: uncomment and set path if needed
# pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
openai.api_key = "your-api-key-here" # 🔐 Replace with your actual OpenAI API key
### Step 1: Extract Text from PDF or Fallback to OCR ###
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text() for page in doc])
    if text.strip():  # If text was extracted from PDF
        print("Text extracted using fitz.")
        return text
    # OCR fallback for scanned/image-based PDFs
    print("No extractable text found. Trying OCR instead...")
    images = convert_from_path(pdf_path)
    ocr_text = "\n".join(pytesseract.image_to_string(image) for image in images)
    return ocr_text
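# Note: convert_from_path shells out to Poppler, which is installed separately
# from pdf2image. A hedged sketch for machines where Poppler is not on PATH
# (poppler_path is a real pdf2image parameter; the path below is an example):
# images = convert_from_path(pdf_path, poppler_path=r"C:\poppler\Library\bin")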
### Step 2: Split Text into Chunks ###
def split_text(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=30)
    return text_splitter.split_text(text)
### Step 3: Get Top-K Relevant Chunks Based on Query ###
def get_relevant_chunks(retrieved_chunks, relevant_queries, model_name='all-MiniLM-L6-v2', top_k=5, similarity_threshold=0.4):
    model = SentenceTransformer(model_name)
    retrieved_chunks = [chunk for chunk in retrieved_chunks if chunk.strip()]
    relevant_queries = [q for q in relevant_queries if q.strip()]
    if not retrieved_chunks:
        raise ValueError("retrieved_chunks is empty!")
    if not relevant_queries:
        raise ValueError("relevant_queries is empty!")
    retrieved_embeddings = model.encode(retrieved_chunks, convert_to_tensor=True)
    relevant_embeddings = model.encode(relevant_queries, convert_to_tensor=True)
    cosine_sim_matrix = util.cos_sim(retrieved_embeddings, relevant_embeddings)
    relevant_scores = []
    for i, retrieved in enumerate(retrieved_chunks):
        for j, relevant in enumerate(relevant_queries):
            score = cosine_sim_matrix[i][j].item()
            relevant_scores.append((retrieved, relevant, score))
    relevant_scores.sort(key=lambda x: x[2], reverse=True)
    filtered = [x for x in relevant_scores if x[2] >= similarity_threshold]
    top_filtered = filtered[:top_k]
    for r, q, s in top_filtered:
        print(f"\nChunk:\n{r[:150]}...\nQuery: {q}\nScore: {s:.4f}")
    return [x[0] for x in top_filtered]
### Step 4: Query OpenAI LLM with Prompt ###
def query_openai(prompt):
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are an assistant."},
            {"role": "user", "content": prompt}
        ]
    )
    return response["choices"][0]["message"]["content"]
### Final Workflow ###
def process_pdf(pdf_path):
    extracted_text = extract_text_from_pdf(pdf_path)
    print("Extracted text length:", len(extracted_text))
    retrieved_chunks = split_text(extracted_text)
    print("Number of chunks created:", len(retrieved_chunks))
    relevant_queries = ["Eiffel Tower", "Paris landmarks", "French history"]
    top_chunks = get_relevant_chunks(retrieved_chunks, relevant_queries)
    # Join outside the f-string: backslashes inside f-string expressions are a
    # SyntaxError before Python 3.12.
    joined_chunks = "\n\n".join(top_chunks)
    prompt = (
        f"You are a historical assistant. Summarize the following content "
        f"in context of the queries: {', '.join(relevant_queries)}.\n\n"
        f"Relevant content:\n{joined_chunks}"
    )
    openai_response = query_openai(prompt)
    print("\nOpenAI Response:\n", openai_response)
### Run the pipeline ###
if __name__ == "__main__":
    process_pdf(r"C:\Users\shalini\Desktop\Project\abc.pdf")
=======================================
import os
from openai import AzureOpenAI  # requires openai>=1.0

def openai_llm_call(prompt: str, persona: str) -> str:
    client = AzureOpenAI(
        api_version=os.getenv("API_VERSION"),
        azure_endpoint=os.getenv("ENDPOINT"),
        api_key=os.getenv("OPENAI_API_KEY"),
    )
    response = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": persona,
            },
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model=os.getenv("DEPLOYMENT")
    )
    return response.choices[0].message.content
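# Example usage (a sketch; API_VERSION, ENDPOINT, OPENAI_API_KEY and DEPLOYMENT
# must be set in the environment to match the os.getenv calls above):
if __name__ == "__main__":
    answer = openai_llm_call(
        prompt="Summarize the Eiffel Tower in one sentence.",
        persona="You are a concise historical assistant.",
    )
    print(answer)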