from sentence_transformers import SentenceTransformer, util

print("import done")

# Input chunks
retrieved_chunks = [
    "The Eiffel Tower is a landmark in Paris.",
    "Paris is the capital of France.",
    "The Louvre is also in Paris.",
    "Eiffel Tower was built in 1889.",
    "It is a famous tourist spot."
]
relevant_chunks = [
    "The Eiffel Tower is a landmark in Paris.",
    "Eiffel Tower was built in 1889."
]

# Load sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Compute embeddings
retrieved_embeddings = model.encode(retrieved_chunks, convert_to_tensor=True)
relevant_embeddings = model.encode(relevant_chunks, convert_to_tensor=True)

# Calculate pairwise cosine similarities
cosine_sim_matrix = util.cos_sim(retrieved_embeddings, relevant_embeddings)

# Print similarity matrix
print("Cosine Similarity Matrix (rows: retrieved, columns: relevant):\n")
for i, retrieved in enumerate(retrieved_chunks):
    for j, relevant in enumerate(relevant_chunks):
        score = cosine_sim_matrix[i][j].item()
        print(f"Similarity between:\n  Retrieved: \"{retrieved}\"\n  Relevant : \"{relevant}\"\n  Score    : {score:.4f}\n")
-----------------------------------------
import numpy as np
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
#client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
# Load models
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
perplexity_model = GPT2LMHeadModel.from_pretrained("gpt2")
perplexity_model.eval()
# Evaluation Metrics
def bleu_rouge_score(reference, generated):
    bleu = sentence_bleu([reference.split()], generated.split())
    rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rougeL = rouge.score(reference, generated)['rougeL'].fmeasure
    return {'bleu': bleu, 'rougeL': rougeL}

def cosine_sim(reference, generated):
    emb_ref = embedding_model.encode([reference])[0]
    emb_gen = embedding_model.encode([generated])[0]
    sim = cosine_similarity([emb_ref], [emb_gen])[0][0]
    return sim

def perplexity_score(text):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = perplexity_model(**inputs, labels=inputs["input_ids"])
        loss = outputs.loss
    return torch.exp(loss).item()
def precision_at_k(retrieved, relevant, k):
    top_k = retrieved[:k]
    correct = sum(1 for item in top_k if item in relevant)
    return correct / k

def recall_at_k(retrieved, relevant, k):
    correct = sum(1 for item in retrieved[:k] if item in relevant)
    return correct / len(relevant)

def ndcg_at_k(retrieved, relevant, k):
    def dcg(items):
        return sum(1 / np.log2(i + 2) if items[i] in relevant else 0 for i in range(len(items)))
    ideal = dcg(relevant[:k])
    actual = dcg(retrieved[:k])
    return actual / ideal if ideal != 0 else 0

def hit_at_k(retrieved, relevant, k):
    top_k = retrieved[:k]
    return int(any(item in relevant for item in top_k))
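
# Not in the original metric set: mean reciprocal rank (MRR) is a common
# companion to the metrics above; a minimal sketch using the same
# retrieved/relevant list convention.
def mrr(retrieved, relevant):
    # Reciprocal rank of the first relevant item; 0.0 if none is retrieved.
    for i, item in enumerate(retrieved):
        if item in relevant:
            return 1.0 / (i + 1)
    return 0.0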
# Main Evaluation
def full_evaluation(reference, generated, retrieved, relevant_chunks):
    return {
        **bleu_rouge_score(reference, generated),
        "cosine_similarity": cosine_sim(reference, generated),
        "perplexity": perplexity_score(generated),
        "precision@5": precision_at_k(retrieved, relevant_chunks, 5),
        "recall@5": recall_at_k(retrieved, relevant_chunks, 5),
        "ndcg@5": ndcg_at_k(retrieved, relevant_chunks, 5),
        "hit@5": hit_at_k(retrieved, relevant_chunks, 5)
    }
# Sample Run
if __name__ == "__main__":
    reference_answer = "The Eiffel Tower is located in Paris."
    generated_response = "Eiffel Tower stands in Paris."
    retrieved_chunks = [
        "The Eiffel Tower is a landmark in Paris.",
        "Paris is the capital of France.",
        "The Louvre is also in Paris.",
        "Eiffel Tower was built in 1889.",
        "It is a famous tourist spot."
    ]
    relevant_chunks = [
        "The Eiffel Tower is a landmark in Paris.",
        "Eiffel Tower was built in 1889."
    ]
    scores = full_evaluation(reference_answer, generated_response, retrieved_chunks, relevant_chunks)
    for metric, score in scores.items():
        print(f"{metric}: {score:.4f}" if isinstance(score, float) else f"{metric}: {score}")
---------------------------
# Install dependencies first (if not already installed)
# pip install openai sentence-transformers langchain pymupdf
import fitz # PyMuPDF for PDF text extraction
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util
import openai
# Configure OpenAI API Key
openai.api_key = "YOUR_OPENAI_API_KEY" # Replace with your actual OpenAI API key
### Step 1: Extract Text from PDF ###
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = "\n".join(page.get_text() for page in doc)
    return text

### Step 2: Split Text Using RecursiveCharacterTextSplitter ###
def split_text(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    return text_splitter.split_text(text)
### Step 3: Compute Embeddings for Chunk Retrieval ###
def get_relevant_chunks(retrieved_chunks, relevant_queries, model_name='all-MiniLM-L6-v2'):
    model = SentenceTransformer(model_name)
    retrieved_embeddings = model.encode(retrieved_chunks, convert_to_tensor=True)
    relevant_embeddings = model.encode(relevant_queries, convert_to_tensor=True)
    cosine_sim_matrix = util.cos_sim(retrieved_embeddings, relevant_embeddings)
    print("Cosine Similarity Matrix (rows: retrieved, columns: relevant queries):\n")
    relevant_scores = []
    for i, retrieved in enumerate(retrieved_chunks):
        for j, relevant in enumerate(relevant_queries):
            score = cosine_sim_matrix[i][j].item()
            relevant_scores.append((retrieved, relevant, score))
            print(f"Similarity between:\n  Retrieved: \"{retrieved}\"\n  Query    : \"{relevant}\"\n  Score    : {score:.4f}\n")
    # Sort and return the top-K most relevant chunks
    relevant_scores.sort(key=lambda x: x[2], reverse=True)
    return [x[0] for x in relevant_scores[:5]]  # Adjust top-K as needed
### Step 4: Pass Top-K Chunks to OpenAI LLM ###
# Note: openai.ChatCompletion is the legacy interface of the pre-1.0 openai SDK.
def query_openai(prompt):
    response = openai.ChatCompletion.create(
        model="gpt-4",  # You can change this to "gpt-3.5-turbo" or another model
        messages=[{"role": "system", "content": "You are an assistant."},
                  {"role": "user", "content": prompt}]
    )
    return response["choices"][0]["message"]["content"]
### Final Workflow ###
def process_pdf(pdf_path):
    # Step 1: Extract text from PDF
    extracted_text = extract_text_from_pdf(pdf_path)
    # Step 2: Split text into chunks
    retrieved_chunks = split_text(extracted_text)
    # Step 3: Define relevant queries (modify as per your needs)
    relevant_queries = ["Eiffel Tower", "Paris landmarks", "French history"]
    # Step 4: Retrieve top-K relevant chunks
    top_chunks = get_relevant_chunks(retrieved_chunks, relevant_queries)
    # Step 5: Query OpenAI LLM with relevant chunks
    prompt = f"Summarize this information: {' '.join(top_chunks)}"
    openai_response = query_openai(prompt)
    print("\nOpenAI Response:\n", openai_response)

# Run the pipeline with a sample PDF file
process_pdf("C:\\Users\\YourName\\Documents\\sample.pdf")  # Replace with your actual PDF file path
###############################################################################
# Refined pipeline: same dependencies as above, plus top-K and threshold filtering.
import fitz  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util
import openai

openai.api_key = "YOUR_OPENAI_API_KEY"  # Replace with your actual OpenAI API key

### Step 1: Extract Text from PDF ###
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = "\n".join(page.get_text() for page in doc)
    return text

### Step 2: Split Text Using RecursiveCharacterTextSplitter ###
def split_text(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    return text_splitter.split_text(text)
### Step 3: Compute Embeddings for Chunk Retrieval ###
def get_relevant_chunks(retrieved_chunks, relevant_queries, model_name='all-MiniLM-L6-v2', top_k=5, similarity_threshold=0.4):
    model = SentenceTransformer(model_name)
    # Clean up empty strings
    retrieved_chunks = [chunk for chunk in retrieved_chunks if chunk.strip()]
    relevant_queries = [q for q in relevant_queries if q.strip()]
    # Debug check
    if not retrieved_chunks:
        raise ValueError("retrieved_chunks is empty!")
    if not relevant_queries:
        raise ValueError("relevant_queries is empty!")
    # Compute embeddings
    retrieved_embeddings = model.encode(retrieved_chunks, convert_to_tensor=True)
    relevant_embeddings = model.encode(relevant_queries, convert_to_tensor=True)
    # Cosine similarity matrix: (retrieved x queries)
    cosine_sim_matrix = util.cos_sim(retrieved_embeddings, relevant_embeddings)
    # Log similarity matrix
    print("Cosine Similarity Matrix (rows: chunks, columns: queries):\n", cosine_sim_matrix)
    # Score all pairs
    relevant_scores = []
    for i, retrieved in enumerate(retrieved_chunks):
        for j, relevant in enumerate(relevant_queries):
            score = cosine_sim_matrix[i][j].item()
            relevant_scores.append((retrieved, relevant, score))
    # Sort by score descending
    relevant_scores.sort(key=lambda x: x[2], reverse=True)
    # Log top matches
    print("\nTop Relevant Chunks and Scores:")
    for r, q, s in relevant_scores[:top_k]:
        print(f"\nChunk:\n{r[:150]}...\nQuery: {q}\nScore: {s:.4f}")
    # Apply threshold
    filtered = [x for x in relevant_scores if x[2] >= similarity_threshold]
    top_filtered = filtered[:top_k]
    return [x[0] for x in top_filtered]
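
# Caveat (an observation, not part of the original script): every
# (chunk, query) pair is scored, so the same chunk can occupy several top-k
# slots by matching different queries. A minimal dedup pass over the sorted
# score list, as a sketch:
def dedup_top_chunks(relevant_scores, top_k):
    # relevant_scores: list of (chunk, query, score), sorted descending by score
    seen, out = set(), []
    for chunk, _query, _score in relevant_scores:
        if chunk not in seen:
            seen.add(chunk)
            out.append(chunk)
        if len(out) == top_k:
            break
    return out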
### Step 4: Pass Top-K Chunks to OpenAI LLM ###
def query_openai(prompt):
    response = openai.ChatCompletion.create(
        model="gpt-4",  # Or "gpt-3.5-turbo"
        messages=[
            {"role": "system", "content": "You are an assistant."},
            {"role": "user", "content": prompt}
        ]
    )
    return response["choices"][0]["message"]["content"]
### Final Workflow ###
def process_pdf(pdf_path):
    # Step 1: Extract text
    extracted_text = extract_text_from_pdf(pdf_path)
    # Step 2: Split into chunks
    retrieved_chunks = split_text(extracted_text)
    # Step 3: Define queries
    relevant_queries = ["Eiffel Tower", "Paris landmarks", "French history"]
    # Step 4: Retrieve top-K relevant chunks
    top_chunks = get_relevant_chunks(retrieved_chunks, relevant_queries)
    # Step 5: Query OpenAI
    # Join outside the f-string: backslashes inside f-string expressions are a
    # SyntaxError before Python 3.12.
    joined_chunks = "\n\n".join(top_chunks)
    prompt = (
        f"You are a historical assistant. Summarize the following content "
        f"in context of the queries: {', '.join(relevant_queries)}.\n\n"
        f"Relevant content:\n{joined_chunks}"
    )
    openai_response = query_openai(prompt)
    print("\nOpenAI Response:\n", openai_response)

### Run the pipeline on your sample PDF ###
if __name__ == "__main__":
    process_pdf(r"C:\Users\shalini\Desktop\Project\abc.pdf")
=======================================================================
import fitz
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util
import openai
# For Windows users: uncomment and set path if needed
# pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
openai.api_key = "your-api-key-here" # 🔐 Replace with your actual OpenAI API key
### Step 1: Extract Text from PDF or Fallback to OCR ###
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = "\n".join(page.get_text() for page in doc)
    if text.strip():  # If text was extracted from PDF
        print("Text extracted using fitz.")
        return text
    # OCR fallback for scanned/image-based PDFs
    print("No extractable text found. Trying OCR instead...")
    images = convert_from_path(pdf_path)
    ocr_text = "\n".join(pytesseract.image_to_string(image) for image in images)
    return ocr_text
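
# The fallback above is all-or-nothing: one native-text page suppresses OCR for
# the whole file. For mixed PDFs (some native pages, some scanned), a per-page
# variant is sketched below -- not part of the original script, same imports.
def extract_text_per_page(pdf_path):
    doc = fitz.open(pdf_path)
    images = None  # rendered lazily, only if some page actually needs OCR
    pages = []
    for page_num, page in enumerate(doc):
        text = page.get_text()
        if not text.strip():
            if images is None:
                images = convert_from_path(pdf_path)
            text = pytesseract.image_to_string(images[page_num])
        pages.append(text)
    return "\n".join(pages)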
### Step 2: Split Text into Chunks ###
def split_text(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=30)
    return text_splitter.split_text(text)
### Step 3: Get Top-K Relevant Chunks Based on Query ###
def get_relevant_chunks(retrieved_chunks, relevant_queries, model_name='all-MiniLM-L6-v2', top_k=5, similarity_threshold=0.4):
    model = SentenceTransformer(model_name)
    retrieved_chunks = [chunk for chunk in retrieved_chunks if chunk.strip()]
    relevant_queries = [q for q in relevant_queries if q.strip()]
    if not retrieved_chunks:
        raise ValueError("retrieved_chunks is empty!")
    if not relevant_queries:
        raise ValueError("relevant_queries is empty!")
    retrieved_embeddings = model.encode(retrieved_chunks, convert_to_tensor=True)
    relevant_embeddings = model.encode(relevant_queries, convert_to_tensor=True)
    cosine_sim_matrix = util.cos_sim(retrieved_embeddings, relevant_embeddings)
    relevant_scores = []
    for i, retrieved in enumerate(retrieved_chunks):
        for j, relevant in enumerate(relevant_queries):
            score = cosine_sim_matrix[i][j].item()
            relevant_scores.append((retrieved, relevant, score))
    relevant_scores.sort(key=lambda x: x[2], reverse=True)
    filtered = [x for x in relevant_scores if x[2] >= similarity_threshold]
    top_filtered = filtered[:top_k]
    for r, q, s in top_filtered:
        print(f"\nChunk:\n{r[:150]}...\nQuery: {q}\nScore: {s:.4f}")
    return [x[0] for x in top_filtered]
### Step 4: Query OpenAI LLM with Prompt ###
def query_openai(prompt):
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are an assistant."},
            {"role": "user", "content": prompt}
        ]
    )
    return response["choices"][0]["message"]["content"]
### Final Workflow ###
def process_pdf(pdf_path):
    extracted_text = extract_text_from_pdf(pdf_path)
    print("Extracted text length:", len(extracted_text))
    retrieved_chunks = split_text(extracted_text)
    print("Number of chunks created:", len(retrieved_chunks))
    relevant_queries = ["Eiffel Tower", "Paris landmarks", "French history"]
    top_chunks = get_relevant_chunks(retrieved_chunks, relevant_queries)
    # Same fix as above: join outside the f-string to avoid the pre-3.12
    # SyntaxError on backslashes inside f-string expressions.
    joined_chunks = "\n\n".join(top_chunks)
    prompt = (
        f"You are a historical assistant. Summarize the following content "
        f"in context of the queries: {', '.join(relevant_queries)}.\n\n"
        f"Relevant content:\n{joined_chunks}"
    )
    openai_response = query_openai(prompt)
    print("\nOpenAI Response:\n", openai_response)

### Run the pipeline ###
if __name__ == "__main__":
    process_pdf(r"C:\Users\shalini\Desktop\Project\abc.pdf")
=======================================
import os

from openai import AzureOpenAI  # Azure client from the openai>=1.0 SDK

def openai_llm_call(prompt: str, persona: str) -> str:
    client = AzureOpenAI(
        api_version=os.getenv("API_VERSION"),
        azure_endpoint=os.getenv("ENDPOINT"),
        api_key=os.getenv("OPENAI_API_KEY"),
    )
    response = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": persona,
            },
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model=os.getenv("DEPLOYMENT")
    )
    return response.choices[0].message.content
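
# Usage sketch -- assumes API_VERSION, ENDPOINT, OPENAI_API_KEY, and DEPLOYMENT
# are set in the environment (names taken from the os.getenv calls above):
if __name__ == "__main__":
    answer = openai_llm_call(
        prompt="What year was the Eiffel Tower built?",
        persona="You are a concise historical assistant.",
    )
    print(answer)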