|
|
from sentence_transformers import SentenceTransformer, util

print("import done")

# Chunks returned by the retriever.
retrieved_chunks = [
    "The Eiffel Tower is a landmark in Paris.",
    "Paris is the capital of France.",
    "The Louvre is also in Paris.",
    "Eiffel Tower was built in 1889.",
    "It is a famous tourist spot."
]

# Ground-truth chunks the retriever should have found.
relevant_chunks = [
    "The Eiffel Tower is a landmark in Paris.",
    "Eiffel Tower was built in 1889."
]

model = SentenceTransformer('all-MiniLM-L6-v2')

retrieved_embeddings = model.encode(retrieved_chunks, convert_to_tensor=True)
relevant_embeddings = model.encode(relevant_chunks, convert_to_tensor=True)

# Pairwise cosine similarity: one row per retrieved chunk, one column per relevant chunk.
cosine_sim_matrix = util.cos_sim(retrieved_embeddings, relevant_embeddings)

print("Cosine Similarity Matrix (rows: retrieved, columns: relevant):\n")
for i, retrieved in enumerate(retrieved_chunks):
    for j, relevant in enumerate(relevant_chunks):
        score = cosine_sim_matrix[i][j].item()
        print(f"Similarity between:\n  Retrieved: \"{retrieved}\"\n  Relevant : \"{relevant}\"\n  Score    : {score:.4f}\n")
|
|
----------------------------------------- |
|
|
|
|
|
|
|
|
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
perplexity_model = GPT2LMHeadModel.from_pretrained("gpt2")
perplexity_model.eval()


def bleu_rouge_score(reference, generated):
    # Smoothing (method1) avoids a zero BLEU score when short sentences share
    # no 4-grams with the reference.
    bleu = sentence_bleu([reference.split()], generated.split(),
                         smoothing_function=SmoothingFunction().method1)
    rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rougeL = rouge.score(reference, generated)['rougeL'].fmeasure
    return {'bleu': bleu, 'rougeL': rougeL}
|
|
|
|
|
def cosine_sim(reference, generated):
    emb_ref = embedding_model.encode([reference])[0]
    emb_gen = embedding_model.encode([generated])[0]
    sim = cosine_similarity([emb_ref], [emb_gen])[0][0]
    return sim
|
|
|
|
|
def perplexity_score(text):
    # Perplexity of the text under GPT-2; lower means more fluent/predictable.
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = perplexity_model(**inputs, labels=inputs["input_ids"])
    loss = outputs.loss
    return torch.exp(loss).item()
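# Usage sketch (not in the original script): fluent text should score lower
# than scrambled text; exact values depend on the GPT-2 weights.
#   perplexity_score("The Eiffel Tower is located in Paris.")   # lower
#   perplexity_score("Tower Paris the located Eiffel is in.")   # higher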
|
|
|
|
|
def precision_at_k(retrieved, relevant, k):
    top_k = retrieved[:k]
    correct = sum(1 for item in top_k if item in relevant)
    return correct / k
|
|
|
|
|
def recall_at_k(retrieved, relevant, k):
    correct = sum(1 for item in retrieved[:k] if item in relevant)
    return correct / len(relevant)
|
|
|
|
|
def ndcg_at_k(retrieved, relevant, k):
    # Binary relevance: the ideal DCG assumes the relevant items fill the top ranks.
    def dcg(items):
        return sum(1 / np.log2(i + 2) for i, item in enumerate(items) if item in relevant)

    ideal = dcg(relevant[:k])
    actual = dcg(retrieved[:k])
    return actual / ideal if ideal != 0 else 0
|
|
|
|
|
def hit_at_k(retrieved, relevant, k):
    top_k = retrieved[:k]
    return int(any(item in relevant for item in top_k))
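# Hand-checkable sanity check for the ranking metrics (a sketch, not part of
# the original script). With retrieved = ["a", "b", "c"] and relevant = ["a", "c"]:
_retrieved, _relevant = ["a", "b", "c"], ["a", "c"]
assert precision_at_k(_retrieved, _relevant, 2) == 0.5   # only "a" in the top 2 is relevant
assert recall_at_k(_retrieved, _relevant, 2) == 0.5      # 1 of the 2 relevant items retrieved
assert hit_at_k(_retrieved, _relevant, 2) == 1           # at least one relevant item in the top 2
assert abs(ndcg_at_k(_retrieved, _relevant, 2) - 0.6131) < 1e-3  # 1 / (1 + 1/log2(3))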
|
|
|
|
|
|
|
|
def full_evaluation(reference, generated, retrieved, relevant_chunks):
    return {
        **bleu_rouge_score(reference, generated),
        "cosine_similarity": cosine_sim(reference, generated),
        "perplexity": perplexity_score(generated),
        "precision@5": precision_at_k(retrieved, relevant_chunks, 5),
        "recall@5": recall_at_k(retrieved, relevant_chunks, 5),
        "ndcg@5": ndcg_at_k(retrieved, relevant_chunks, 5),
        "hit@5": hit_at_k(retrieved, relevant_chunks, 5)
    }
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    reference_answer = "The Eiffel Tower is located in Paris."
    generated_response = "Eiffel Tower stands in Paris."

    retrieved_chunks = [
        "The Eiffel Tower is a landmark in Paris.",
        "Paris is the capital of France.",
        "The Louvre is also in Paris.",
        "Eiffel Tower was built in 1889.",
        "It is a famous tourist spot."
    ]

    relevant_chunks = [
        "The Eiffel Tower is a landmark in Paris.",
        "Eiffel Tower was built in 1889."
    ]

    scores = full_evaluation(reference_answer, generated_response, retrieved_chunks, relevant_chunks)

    for metric, score in scores.items():
        print(f"{metric}: {score:.4f}" if isinstance(score, float) else f"{metric}: {score}")
|
|
|
|
|
--------------------------- |
|
|
|
|
|
|
|
|
|
|
|
import fitz  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util
import openai

openai.api_key = "YOUR_OPENAI_API_KEY"
|
|
|
|
|
|
|
|
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text() for page in doc])
    return text
|
|
|
|
|
|
|
|
def split_text(text):
    # chunk_size and chunk_overlap are measured in characters, not tokens.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    return text_splitter.split_text(text)
|
|
|
|
|
|
|
|
def get_relevant_chunks(retrieved_chunks, relevant_queries, model_name='all-MiniLM-L6-v2'):
    model = SentenceTransformer(model_name)

    retrieved_embeddings = model.encode(retrieved_chunks, convert_to_tensor=True)
    relevant_embeddings = model.encode(relevant_queries, convert_to_tensor=True)

    cosine_sim_matrix = util.cos_sim(retrieved_embeddings, relevant_embeddings)

    print("Cosine Similarity Matrix (rows: retrieved, columns: relevant queries):\n")
    relevant_scores = []

    for i, retrieved in enumerate(retrieved_chunks):
        for j, relevant in enumerate(relevant_queries):
            score = cosine_sim_matrix[i][j].item()
            relevant_scores.append((retrieved, relevant, score))
            print(f"Similarity between:\n  Retrieved: \"{retrieved}\"\n  Query    : \"{relevant}\"\n  Score    : {score:.4f}\n")

    # Highest-scoring (chunk, query) pairs first; return the top five chunk texts.
    relevant_scores.sort(key=lambda x: x[2], reverse=True)
    return [x[0] for x in relevant_scores[:5]]
|
|
|
|
|
|
|
|
def query_openai(prompt):
    # Uses the legacy openai<1.0 ChatCompletion interface.
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[{"role": "system", "content": "You are an assistant."},
                  {"role": "user", "content": prompt}]
    )
    return response["choices"][0]["message"]["content"]
|
|
|
|
|
|
|
|
def process_pdf(pdf_path):
    extracted_text = extract_text_from_pdf(pdf_path)

    retrieved_chunks = split_text(extracted_text)

    relevant_queries = ["Eiffel Tower", "Paris landmarks", "French history"]

    top_chunks = get_relevant_chunks(retrieved_chunks, relevant_queries)

    prompt = f"Summarize this information: {' '.join(top_chunks)}"
    openai_response = query_openai(prompt)

    print("\nOpenAI Response:\n", openai_response)
|
|
|
|
|
|
|
|
process_pdf("C:\\Users\\YourName\\Documents\\sample.pdf") |
|
|
|
|
|
|
|
|
|
|
|
-----------------------------------------


import fitz  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util
import openai

openai.api_key = "YOUR_OPENAI_API_KEY"


def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text() for page in doc])
    return text


def split_text(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    return text_splitter.split_text(text)
|
|
|
|
|
|
|
|
def get_relevant_chunks(retrieved_chunks, relevant_queries, model_name='all-MiniLM-L6-v2', top_k=5, similarity_threshold=0.4):
    model = SentenceTransformer(model_name)

    # Drop empty strings before encoding.
    retrieved_chunks = [chunk for chunk in retrieved_chunks if chunk.strip()]
    relevant_queries = [q for q in relevant_queries if q.strip()]

    if not retrieved_chunks:
        raise ValueError("retrieved_chunks is empty!")
    if not relevant_queries:
        raise ValueError("relevant_queries is empty!")

    retrieved_embeddings = model.encode(retrieved_chunks, convert_to_tensor=True)
    relevant_embeddings = model.encode(relevant_queries, convert_to_tensor=True)

    cosine_sim_matrix = util.cos_sim(retrieved_embeddings, relevant_embeddings)
    print("Cosine Similarity Matrix (rows: chunks, columns: queries):\n", cosine_sim_matrix)

    # Score every (chunk, query) pair.
    relevant_scores = []
    for i, retrieved in enumerate(retrieved_chunks):
        for j, relevant in enumerate(relevant_queries):
            score = cosine_sim_matrix[i][j].item()
            relevant_scores.append((retrieved, relevant, score))

    relevant_scores.sort(key=lambda x: x[2], reverse=True)

    print("\nTop Relevant Chunks and Scores:")
    for r, q, s in relevant_scores[:top_k]:
        print(f"\nChunk:\n{r[:150]}...\nQuery: {q}\nScore: {s:.4f}")

    # Keep only pairs above the threshold, then return the top_k chunk texts.
    filtered = [x for x in relevant_scores if x[2] >= similarity_threshold]
    top_filtered = filtered[:top_k]

    return [x[0] for x in top_filtered]
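# Tuning note (an assumption, not from the original script): all-MiniLM-L6-v2
# cosine scores for loosely related sentence pairs often land around 0.3-0.5,
# so similarity_threshold=0.4 keeps near matches; raise it toward 0.6 for
# stricter filtering, at the cost of sometimes returning fewer than top_k chunks.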
|
|
|
|
|
|
|
|
def query_openai(prompt):
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are an assistant."},
            {"role": "user", "content": prompt}
        ]
    )
    return response["choices"][0]["message"]["content"]
|
|
|
|
|
|
|
|
def process_pdf(pdf_path):
    extracted_text = extract_text_from_pdf(pdf_path)

    retrieved_chunks = split_text(extracted_text)

    relevant_queries = ["Eiffel Tower", "Paris landmarks", "French history"]

    top_chunks = get_relevant_chunks(retrieved_chunks, relevant_queries)

    # Join outside the f-string: a backslash inside an f-string expression is a
    # SyntaxError on Python < 3.12.
    joined_chunks = "\n\n".join(top_chunks)
    prompt = (
        f"You are a historical assistant. Summarize the following content "
        f"in context of the queries: {', '.join(relevant_queries)}.\n\n"
        f"Relevant content:\n{joined_chunks}"
    )
    openai_response = query_openai(prompt)

    print("\nOpenAI Response:\n", openai_response)
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    process_pdf(r"C:\Users\shalini\Desktop\Project\abc.pdf")
|
|
|
|
|
|
|
|
|
|
|
======================================================================= |
|
|
|
|
|
import fitz  # PyMuPDF
import pytesseract
from pdf2image import convert_from_path
from PIL import Image

from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util
import openai

openai.api_key = "your-api-key-here"
|
|
|
|
|
|
|
|
def extract_text_from_pdf(pdf_path):
    # Try the embedded text layer first (fast, exact when present).
    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text() for page in doc])

    if text.strip():
        print("Text extracted using fitz.")
        return text

    # Scanned PDFs have no text layer, so fall back to OCR on rendered pages.
    print("No extractable text found. Trying OCR instead...")
    images = convert_from_path(pdf_path)
    ocr_text = "\n".join(pytesseract.image_to_string(image) for image in images)
    return ocr_text
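# Note: the OCR fallback relies on system binaries that pip does not install:
# pytesseract needs the Tesseract executable on PATH, and pdf2image needs
# Poppler (pdftoppm). Both are external prerequisites, not Python packages.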
|
|
|
|
|
|
|
|
def split_text(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=30)
    return text_splitter.split_text(text)
|
|
|
|
|
|
|
|
def get_relevant_chunks(retrieved_chunks, relevant_queries, model_name='all-MiniLM-L6-v2', top_k=5, similarity_threshold=0.4):
    model = SentenceTransformer(model_name)
    retrieved_chunks = [chunk for chunk in retrieved_chunks if chunk.strip()]
    relevant_queries = [q for q in relevant_queries if q.strip()]

    if not retrieved_chunks:
        raise ValueError("retrieved_chunks is empty!")
    if not relevant_queries:
        raise ValueError("relevant_queries is empty!")

    retrieved_embeddings = model.encode(retrieved_chunks, convert_to_tensor=True)
    relevant_embeddings = model.encode(relevant_queries, convert_to_tensor=True)
    cosine_sim_matrix = util.cos_sim(retrieved_embeddings, relevant_embeddings)

    relevant_scores = []
    for i, retrieved in enumerate(retrieved_chunks):
        for j, relevant in enumerate(relevant_queries):
            score = cosine_sim_matrix[i][j].item()
            relevant_scores.append((retrieved, relevant, score))

    relevant_scores.sort(key=lambda x: x[2], reverse=True)
    filtered = [x for x in relevant_scores if x[2] >= similarity_threshold]
    top_filtered = filtered[:top_k]

    for r, q, s in top_filtered:
        print(f"\nChunk:\n{r[:150]}...\nQuery: {q}\nScore: {s:.4f}")

    return [x[0] for x in top_filtered]
|
|
|
|
|
|
|
|
def query_openai(prompt):
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are an assistant."},
            {"role": "user", "content": prompt}
        ]
    )
    return response["choices"][0]["message"]["content"]
|
|
|
|
|
|
|
|
def process_pdf(pdf_path):
    extracted_text = extract_text_from_pdf(pdf_path)
    print("Extracted text length:", len(extracted_text))

    retrieved_chunks = split_text(extracted_text)
    print("Number of chunks created:", len(retrieved_chunks))

    relevant_queries = ["Eiffel Tower", "Paris landmarks", "French history"]

    top_chunks = get_relevant_chunks(retrieved_chunks, relevant_queries)

    # Join outside the f-string (a backslash in an f-string expression is a
    # SyntaxError on Python < 3.12).
    joined_chunks = "\n\n".join(top_chunks)
    prompt = (
        f"You are a historical assistant. Summarize the following content "
        f"in context of the queries: {', '.join(relevant_queries)}.\n\n"
        f"Relevant content:\n{joined_chunks}"
    )

    openai_response = query_openai(prompt)
    print("\nOpenAI Response:\n", openai_response)
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    process_pdf(r"C:\Users\shalini\Desktop\Project\abc.pdf")
|
|
|
|
|
|
|
|
======================================= |
|
|
import os

from openai import AzureOpenAI  # requires the openai>=1.0 SDK


def openai_llm_call(prompt: str, persona: str) -> str:
    client = AzureOpenAI(
        api_version=os.getenv("API_VERSION"),
        azure_endpoint=os.getenv("ENDPOINT"),
        api_key=os.getenv("OPENAI_API_KEY"),
    )
    response = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": persona,
            },
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model=os.getenv("DEPLOYMENT")
    )

    return response.choices[0].message.content
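if __name__ == "__main__":
    # Minimal usage sketch (not in the original): assumes API_VERSION, ENDPOINT,
    # OPENAI_API_KEY and DEPLOYMENT are set in the environment, with DEPLOYMENT
    # naming an Azure chat-model deployment.
    answer = openai_llm_call(
        prompt="What year was the Eiffel Tower built?",
        persona="You are a concise historical assistant.",
    )
    print(answer)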
|
|
|