| from sentence_transformers import SentenceTransformer, util |
print("import done")

# Candidate chunks; each becomes one ROW of the similarity matrix.
retrieved_chunks = [
    "The Eiffel Tower is a landmark in Paris.",
    "Paris is the capital of France.",
    "The Louvre is also in Paris.",
    "Eiffel Tower was built in 1889.",
    "It is a famous tourist spot.",
]

# Reference chunks; each becomes one COLUMN of the similarity matrix.
relevant_chunks = [
    "The Eiffel Tower is a landmark in Paris.",
    "Eiffel Tower was built in 1889.",
]

model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode both lists as tensors so util.cos_sim can compare them directly.
retrieved_embeddings = model.encode(retrieved_chunks, convert_to_tensor=True)
relevant_embeddings = model.encode(relevant_chunks, convert_to_tensor=True)

# cosine_sim_matrix[i][j] compares retrieved_chunks[i] with relevant_chunks[j].
cosine_sim_matrix = util.cos_sim(retrieved_embeddings, relevant_embeddings)

print("Cosine Similarity Matrix (rows: retrieved, columns: relevant):\n")
for i, retrieved in enumerate(retrieved_chunks):
    for j, relevant in enumerate(relevant_chunks):
        score = cosine_sim_matrix[i][j].item()
        print(f"Similarity between:\n Retrieved: \"{retrieved}\"\n Relevant : \"{relevant}\"\n Score : {score:.4f}\n")

# -----------------------------------------------
# (The separator above used to be fused onto the print statement as bare
# dashes, which is a syntax error; it is kept here as a comment.)
|
|
|
|
| import numpy as np |
| from nltk.translate.bleu_score import sentence_bleu |
| from rouge_score import rouge_scorer |
| from sentence_transformers import SentenceTransformer |
| from sklearn.metrics.pairwise import cosine_similarity |
| from transformers import GPT2Tokenizer, GPT2LMHeadModel |
| import torch |
|
|
| |
|
|
| |
# Models shared by the metric functions; they are loaded once, when the
# module is executed.
_GPT2_CHECKPOINT = "gpt2"

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
tokenizer = GPT2Tokenizer.from_pretrained(_GPT2_CHECKPOINT)
perplexity_model = GPT2LMHeadModel.from_pretrained(_GPT2_CHECKPOINT)
# Switch the language model to evaluation mode before scoring with it.
perplexity_model.eval()
|
|
| |
def bleu_rouge_score(reference, generated):
    """Compute sentence-level BLEU and ROUGE-L F-measure for one text pair.

    Both texts are split on whitespace for BLEU. ROUGE-L is computed with
    stemming enabled.

    Args:
        reference: The reference (gold) text.
        generated: The candidate text to score.

    Returns:
        A dict with keys ``'bleu'`` and ``'rougeL'``.
    """
    # Imported locally so this function stays self-contained.
    from nltk.translate.bleu_score import SmoothingFunction

    # Without smoothing, a short sentence with no matching 4-gram scores
    # (almost) zero and NLTK emits a warning. method1 only adjusts n-gram
    # orders whose match count is zero, so other scores are unchanged.
    bleu = sentence_bleu(
        [reference.split()],
        generated.split(),
        smoothing_function=SmoothingFunction().method1,
    )
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rouge_l = scorer.score(reference, generated)['rougeL'].fmeasure
    return {'bleu': bleu, 'rougeL': rouge_l}
|
|
def cosine_sim(reference, generated):
    """Return the cosine similarity between the embeddings of two texts.

    Each text is encoded with the module-level ``embedding_model`` and the
    two vectors are compared with scikit-learn's ``cosine_similarity``.

    Args:
        reference: The reference text.
        generated: The candidate text.

    Returns:
        The similarity as a built-in ``float``.
    """
    emb_ref = embedding_model.encode([reference])[0]
    emb_gen = embedding_model.encode([generated])[0]
    sim = cosine_similarity([emb_ref], [emb_gen])[0][0]
    # Convert the numpy scalar to a plain float so callers that check
    # isinstance(score, float) or serialize the result treat it like the
    # other metrics.
    return float(sim)
|
|
def perplexity_score(text):
    """Return the perplexity of ``text`` under the module-level GPT-2 model.

    Perplexity is ``exp(loss)``, where the loss is what the model reports
    when the input ids are also passed as labels.

    Args:
        text: The text to score.

    Returns:
        The perplexity as a float, or ``nan`` when the text tokenizes to
        fewer than two tokens (there is then nothing to predict).
    """
    # Truncate to the model's position limit; longer inputs would otherwise
    # fail inside the model.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=perplexity_model.config.n_positions,
    )
    input_ids = inputs["input_ids"]
    if input_ids.shape[-1] < 2:
        return float("nan")
    with torch.no_grad():
        outputs = perplexity_model(**inputs, labels=input_ids)
    return torch.exp(outputs.loss).item()
|
|
def precision_at_k(retrieved, relevant, k):
    """Return the fraction of the first ``k`` retrieved items that are relevant.

    Args:
        retrieved: Ranked sequence of retrieved items.
        relevant: Collection of relevant items (tested with ``in``).
        k: Cutoff rank.

    Returns:
        ``hits / k``; ``0.0`` when ``k`` is not positive.
    """
    # k == 0 would divide by zero, and a negative k would slice from the
    # end of the list, so both are treated as an empty cutoff.
    if k <= 0:
        return 0.0
    hits = sum(1 for item in retrieved[:k] if item in relevant)
    return hits / k
|
|
def recall_at_k(retrieved, relevant, k):
    """Return the fraction of relevant items found in the first ``k`` results.

    Args:
        retrieved: Ranked sequence of retrieved items.
        relevant: Sequence of relevant items.
        k: Cutoff rank; non-positive values mean an empty cutoff.

    Returns:
        A value in ``[0, 1]``; ``0.0`` when ``relevant`` is empty.
    """
    if not relevant:
        return 0.0
    top_k = retrieved[:max(k, 0)]
    # Count relevant items that were found rather than retrieved items that
    # are relevant, so a duplicated result cannot push recall above 1.
    found = sum(1 for item in relevant if item in top_k)
    return found / len(relevant)
|
|
def ndcg_at_k(retrieved, relevant, k):
    """Return binary-relevance nDCG for the first ``k`` retrieved items.

    A relevant item at zero-based rank ``i`` contributes ``1 / log2(i + 2)``.
    The ideal DCG assumes the first ``min(k, len(relevant))`` ranks are all
    relevant.

    Args:
        retrieved: Ranked sequence of retrieved items.
        relevant: Sequence of relevant items.
        k: Cutoff rank; non-positive values mean an empty cutoff.

    Returns:
        The nDCG as a float; ``0.0`` when the ideal DCG is zero.
    """
    k = max(k, 0)

    actual = 0.0
    credited = []
    for rank, item in enumerate(retrieved[:k]):
        # Credit each relevant item only at its first appearance, so a
        # repeated result cannot push the score above 1.
        if item in relevant and item not in credited:
            credited.append(item)
            actual += 1.0 / np.log2(rank + 2)

    ideal = 0.0
    for rank in range(min(k, len(relevant))):
        ideal += 1.0 / np.log2(rank + 2)

    return float(actual / ideal) if ideal else 0.0
|
|
def hit_at_k(retrieved, relevant, k):
    """Return 1 if any of the first ``k`` retrieved items is relevant, else 0.

    Args:
        retrieved: Ranked sequence of retrieved items.
        relevant: Collection of relevant items (tested with ``in``).
        k: Cutoff rank; non-positive values mean an empty cutoff.

    Returns:
        ``1`` or ``0`` as an int.
    """
    # max(k, 0) stops a negative k from slicing "all but the last |k|" items.
    for candidate in retrieved[:max(k, 0)]:
        if candidate in relevant:
            return 1
    return 0
|
|
| |
def full_evaluation(reference, generated, retrieved, relevant_chunks, k=5):
    """Run every generation and retrieval metric and collect the results.

    Args:
        reference: Reference answer text.
        generated: Generated answer text.
        retrieved: Ranked list of retrieved chunks.
        relevant_chunks: List of chunks considered relevant.
        k: Cutoff rank for the retrieval metrics. Defaults to 5, which
            reproduces the original ``...@5`` keys.

    Returns:
        A dict with ``bleu``, ``rougeL``, ``cosine_similarity``,
        ``perplexity`` and ``precision@k``, ``recall@k``, ``ndcg@k``,
        ``hit@k`` (with ``k`` substituted into the key names).
    """
    return {
        **bleu_rouge_score(reference, generated),
        "cosine_similarity": cosine_sim(reference, generated),
        "perplexity": perplexity_score(generated),
        f"precision@{k}": precision_at_k(retrieved, relevant_chunks, k),
        f"recall@{k}": recall_at_k(retrieved, relevant_chunks, k),
        f"ndcg@{k}": ndcg_at_k(retrieved, relevant_chunks, k),
        f"hit@{k}": hit_at_k(retrieved, relevant_chunks, k),
    }
|
|
| |
if __name__ == "__main__":
    # Example inputs for a single evaluation run.
    reference_answer = "The Eiffel Tower is located in Paris."
    generated_response = "Eiffel Tower stands in Paris."

    retrieved_chunks = [
        "The Eiffel Tower is a landmark in Paris.",
        "Paris is the capital of France.",
        "The Louvre is also in Paris.",
        "Eiffel Tower was built in 1889.",
        "It is a famous tourist spot.",
    ]
    relevant_chunks = [
        "The Eiffel Tower is a landmark in Paris.",
        "Eiffel Tower was built in 1889.",
    ]

    scores = full_evaluation(
        reference_answer,
        generated_response,
        retrieved_chunks,
        relevant_chunks,
    )

    # Floats are shown with four decimals; everything else is printed as-is.
    for metric, score in scores.items():
        if isinstance(score, float):
            print(f"{metric}: {score:.4f}")
        else:
            print(f"{metric}: {score}")
|
|
| --------------------------- |
| |
| |
|
|
| import fitz |
| from langchain.text_splitter import RecursiveCharacterTextSplitter |
| from sentence_transformers import SentenceTransformer, util |
| import openai |
|
|
| |
import os

# Read the key from the OPENAI_API_KEY environment variable so a real
# secret never has to be pasted into the source. The original placeholder
# is kept as the fallback value.
openai.api_key = os.getenv("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY")
|
|
| |
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, joined with newlines.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        The concatenated page text.
    """
    # Open the document as a context manager so the handle is closed even
    # if text extraction raises.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc)
|
|
| |
def split_text(text, chunk_size=500, chunk_overlap=50):
    """Split ``text`` into overlapping chunks.

    Args:
        text: The text to split.
        chunk_size: Passed to ``RecursiveCharacterTextSplitter``. Defaults
            to the original hard-coded 500.
        chunk_overlap: Passed to ``RecursiveCharacterTextSplitter``.
            Defaults to the original hard-coded 50.

    Returns:
        The list of chunks produced by the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)
|
|
| |
def get_relevant_chunks(retrieved_chunks, relevant_queries, model_name='all-MiniLM-L6-v2', top_k=5):
    """Rank chunks by their best cosine similarity to any query.

    Every (chunk, query) pair is scored and printed. The chunks are then
    returned in descending order of score, each chunk at most once.

    Args:
        retrieved_chunks: List of candidate text chunks.
        relevant_queries: List of query strings to compare against.
        model_name: SentenceTransformer model to load.
        top_k: Maximum number of chunks to return. Defaults to the original
            hard-coded 5.

    Returns:
        Up to ``top_k`` distinct chunks, best match first. Empty when either
        input list is empty.
    """
    # Nothing to compare; skip loading the model entirely.
    if not retrieved_chunks or not relevant_queries:
        return []

    model = SentenceTransformer(model_name)

    retrieved_embeddings = model.encode(retrieved_chunks, convert_to_tensor=True)
    relevant_embeddings = model.encode(relevant_queries, convert_to_tensor=True)

    # Rows are chunks, columns are queries.
    cosine_sim_matrix = util.cos_sim(retrieved_embeddings, relevant_embeddings)

    print("Cosine Similarity Matrix (rows: retrieved, columns: relevant queries):\n")
    relevant_scores = []

    for i, retrieved in enumerate(retrieved_chunks):
        for j, relevant in enumerate(relevant_queries):
            score = cosine_sim_matrix[i][j].item()
            relevant_scores.append((retrieved, relevant, score))
            print(f"Similarity between:\n Retrieved: \"{retrieved}\"\n Query : \"{relevant}\"\n Score : {score:.4f}\n")

    relevant_scores.sort(key=lambda entry: entry[2], reverse=True)

    # A chunk has one entry per query, so slicing the pair list could return
    # the same chunk several times. Keep only its best-scoring occurrence.
    top_chunks = []
    for chunk, _query, _score in relevant_scores:
        if len(top_chunks) >= top_k:
            break
        if chunk not in top_chunks:
            top_chunks.append(chunk)
    return top_chunks
|
|
| |
def query_openai(prompt):
    """Send ``prompt`` to the ``gpt-4`` chat model and return its reply.

    The conversation is a fixed system message followed by ``prompt`` as
    the user message. The content of the first returned choice is returned.
    """
    conversation = [
        {"role": "system", "content": "You are an assistant."},
        {"role": "user", "content": prompt},
    ]
    response = openai.ChatCompletion.create(model="gpt-4", messages=conversation)
    first_choice = response["choices"][0]
    return first_choice["message"]["content"]
|
|
| |
def process_pdf(pdf_path):
    """Extract a PDF's text, pick the best-matching chunks and summarize them.

    The PDF text is split into chunks, the chunks are ranked against a
    fixed list of queries, and the selected chunks are sent to OpenAI in a
    summarization prompt. The reply is printed; nothing is returned.

    Args:
        pdf_path: Path of the PDF file to process.
    """
    chunks = split_text(extract_text_from_pdf(pdf_path))

    queries = ["Eiffel Tower", "Paris landmarks", "French history"]
    best_chunks = get_relevant_chunks(chunks, queries)

    # The selected chunks are joined with single spaces inside the prompt.
    combined = ' '.join(best_chunks)
    answer = query_openai(f"Summarize this information: {combined}")

    print("\nOpenAI Response:\n", answer)
|
|
| |
# Guarded so the pipeline runs only when this file is executed as a script,
# not whenever it is imported.
if __name__ == "__main__":
    process_pdf("C:\\Users\\YourName\\Documents\\sample.pdf")
|
|
| |
| |
def extract_text_from_pdf(pdf_path):
    """Read a PDF with PyMuPDF and return its page texts joined by newlines.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        One string containing the text of all pages.
    """
    # The ``with`` block closes the document once the text has been read;
    # previously the handle was left open.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text() for page in doc]
    return "\n".join(page_texts)
| |
| |
def split_text(text, chunk_size=500, chunk_overlap=50):
    """Break ``text`` into overlapping chunks with a recursive splitter.

    Args:
        text: Text to split.
        chunk_size: Splitter ``chunk_size``; default 500 as before.
        chunk_overlap: Splitter ``chunk_overlap``; default 50 as before.

    Returns:
        List of text chunks.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)
| |
| |
def get_relevant_chunks(retrieved_chunks, relevant_queries, model_name='all-MiniLM-L6-v2', top_k=5, similarity_threshold=0.4):
    """Select the chunks most similar to any query, above a threshold.

    Blank chunks and queries are dropped. Every (chunk, query) pair is then
    scored with cosine similarity. The full matrix and the ``top_k``
    best-scoring pairs are printed for inspection.

    Args:
        retrieved_chunks: Candidate text chunks.
        relevant_queries: Query strings.
        model_name: SentenceTransformer model to load.
        top_k: Maximum number of chunks to return.
        similarity_threshold: Minimum score a pair needs to be kept.

    Returns:
        Up to ``top_k`` distinct chunks whose best pair score meets the
        threshold, best first. May be empty.

    Raises:
        ValueError: If no non-blank chunk or no non-blank query remains.
    """
    retrieved_chunks = [chunk for chunk in retrieved_chunks if chunk.strip()]
    relevant_queries = [q for q in relevant_queries if q.strip()]

    # Validate before loading the model so bad input fails fast.
    if not retrieved_chunks:
        raise ValueError("retrieved_chunks is empty!")
    if not relevant_queries:
        raise ValueError("relevant_queries is empty!")

    model = SentenceTransformer(model_name)
    retrieved_embeddings = model.encode(retrieved_chunks, convert_to_tensor=True)
    relevant_embeddings = model.encode(relevant_queries, convert_to_tensor=True)

    cosine_sim_matrix = util.cos_sim(retrieved_embeddings, relevant_embeddings)

    print("Cosine Similarity Matrix (rows: chunks, columns: queries):\n", cosine_sim_matrix)

    relevant_scores = []
    for i, retrieved in enumerate(retrieved_chunks):
        for j, relevant in enumerate(relevant_queries):
            score = cosine_sim_matrix[i][j].item()
            relevant_scores.append((retrieved, relevant, score))

    relevant_scores.sort(key=lambda entry: entry[2], reverse=True)

    print("\nTop Relevant Chunks and Scores:")
    for r, q, s in relevant_scores[:top_k]:
        print(f"\nChunk:\n{r[:150]}...\nQuery: {q}\nScore: {s:.4f}")

    # A chunk has one entry per query, so slicing the pair list could return
    # the same chunk several times. Keep only its best-scoring occurrence.
    selected = []
    for chunk, _query, score in relevant_scores:
        if len(selected) >= top_k or score < similarity_threshold:
            # The list is sorted, so every later score is below the threshold too.
            break
        if chunk not in selected:
            selected.append(chunk)
    return selected
| |
| |
def query_openai(prompt):
    """Ask the ``gpt-4`` chat model to answer ``prompt``.

    A fixed system message precedes the user prompt. The text of the first
    choice in the response is returned.
    """
    system_turn = {"role": "system", "content": "You are an assistant."}
    user_turn = {"role": "user", "content": prompt}
    completion = openai.ChatCompletion.create(
        messages=[system_turn, user_turn],
        model="gpt-4",
    )
    return completion["choices"][0]["message"]["content"]
| |
| |
def process_pdf(pdf_path):
    """Summarize the parts of a PDF that match a fixed set of queries.

    The PDF text is extracted and split into chunks. The chunks that best
    match the queries are sent to OpenAI with a summarization prompt, and
    the reply is printed. Nothing is returned.

    Args:
        pdf_path: Path of the PDF file to process.
    """
    extracted_text = extract_text_from_pdf(pdf_path)
    retrieved_chunks = split_text(extracted_text)

    relevant_queries = ["Eiffel Tower", "Paris landmarks", "French history"]
    top_chunks = get_relevant_chunks(retrieved_chunks, relevant_queries)

    # With no selected chunks there is nothing to summarize, so do not send
    # an empty prompt to the model.
    if not top_chunks:
        print("\nNo relevant chunks found; skipping OpenAI request.")
        return

    # Join outside the f-string: a backslash inside an f-string expression
    # is a SyntaxError before Python 3.12.
    relevant_content = "\n\n".join(top_chunks)
    prompt = (
        f"You are a historical assistant. Summarize the following content "
        f"in context of the queries: {', '.join(relevant_queries)}.\n\n"
        f"Relevant content:\n{relevant_content}"
    )
    openai_response = query_openai(prompt)

    print("\nOpenAI Response:\n", openai_response)
| |
| |
if __name__ == "__main__":
    # Script entry point: run the pipeline on one fixed PDF.
    pdf_path = r"C:\Users\shalini\Desktop\Project\abc.pdf"
    process_pdf(pdf_path)
|
|
|
|
|
|
| ======================================================================= |
|
|
| import fitz |
| import pytesseract |
| from pdf2image import convert_from_path |
| from PIL import Image |
|
|
| from langchain.text_splitter import RecursiveCharacterTextSplitter |
| from sentence_transformers import SentenceTransformer, util |
| import openai |
|
|
| |
| |
|
|
import os

# Take the key from the OPENAI_API_KEY environment variable when it is set,
# so a real secret is not committed. The original placeholder remains the
# fallback value.
openai.api_key = os.getenv("OPENAI_API_KEY", "your-api-key-here")
|
|
| |
def extract_text_from_pdf(pdf_path):
    """Return a PDF's text, falling back to OCR when it has no text layer.

    The page text is first read with PyMuPDF. If that text is blank, the
    pages are rendered with ``pdf2image`` and recognized with
    ``pytesseract``.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        The page texts (or OCR results) joined with newlines.
    """
    # Close the document as soon as the text has been read; previously the
    # handle was never released.
    with fitz.open(pdf_path) as doc:
        text = "\n".join(page.get_text() for page in doc)

    if text.strip():
        print("Text extracted using fitz.")
        return text

    print("No extractable text found. Trying OCR instead...")
    images = convert_from_path(pdf_path)
    return "\n".join(pytesseract.image_to_string(image) for image in images)
|
|
| |
def split_text(text, chunk_size=300, chunk_overlap=30):
    """Split ``text`` into overlapping chunks.

    Args:
        text: Text to split.
        chunk_size: Splitter ``chunk_size``; default 300 as before.
        chunk_overlap: Splitter ``chunk_overlap``; default 30 as before.

    Returns:
        List of text chunks from ``RecursiveCharacterTextSplitter``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)
|
|
| |
def get_relevant_chunks(retrieved_chunks, relevant_queries, model_name='all-MiniLM-L6-v2', top_k=5, similarity_threshold=0.4):
    """Return the chunks that best match any query, above a threshold.

    Blank chunks and queries are dropped. Every (chunk, query) pair is
    scored with cosine similarity, and the selected chunks are printed with
    their best-matching query and score.

    Args:
        retrieved_chunks: Candidate text chunks.
        relevant_queries: Query strings.
        model_name: SentenceTransformer model to load.
        top_k: Maximum number of chunks to return.
        similarity_threshold: Minimum score a pair needs to be kept.

    Returns:
        Up to ``top_k`` distinct chunks, best first. May be empty.

    Raises:
        ValueError: If no non-blank chunk or no non-blank query remains.
    """
    retrieved_chunks = [chunk for chunk in retrieved_chunks if chunk.strip()]
    relevant_queries = [q for q in relevant_queries if q.strip()]

    # Validate before loading the model so bad input fails fast.
    if not retrieved_chunks:
        raise ValueError("retrieved_chunks is empty!")
    if not relevant_queries:
        raise ValueError("relevant_queries is empty!")

    model = SentenceTransformer(model_name)
    retrieved_embeddings = model.encode(retrieved_chunks, convert_to_tensor=True)
    relevant_embeddings = model.encode(relevant_queries, convert_to_tensor=True)
    cosine_sim_matrix = util.cos_sim(retrieved_embeddings, relevant_embeddings)

    relevant_scores = []
    for i, retrieved in enumerate(retrieved_chunks):
        for j, relevant in enumerate(relevant_queries):
            score = cosine_sim_matrix[i][j].item()
            relevant_scores.append((retrieved, relevant, score))

    relevant_scores.sort(key=lambda entry: entry[2], reverse=True)

    # A chunk has one entry per query, so slicing the pair list could select
    # the same chunk several times. Keep only its best-scoring pair.
    top_filtered = []
    chosen_chunks = []
    for entry in relevant_scores:
        if len(top_filtered) >= top_k or entry[2] < similarity_threshold:
            # The list is sorted, so every later score is below the threshold too.
            break
        if entry[0] not in chosen_chunks:
            chosen_chunks.append(entry[0])
            top_filtered.append(entry)

    for r, q, s in top_filtered:
        print(f"\nChunk:\n{r[:150]}...\nQuery: {q}\nScore: {s:.4f}")

    return chosen_chunks
|
|
| |
def query_openai(prompt, model="gpt-4", system_prompt="You are an assistant."):
    """Send ``prompt`` to an OpenAI chat model and return the reply text.

    Args:
        prompt: Content of the user message.
        model: Chat model name. Defaults to the original ``"gpt-4"``.
        system_prompt: Content of the system message. Defaults to the
            original ``"You are an assistant."``.

    Returns:
        The message content of the first choice in the response.
    """
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
    )
    return response["choices"][0]["message"]["content"]
|
|
| |
def process_pdf(pdf_path):
    """Summarize the parts of a PDF that match a fixed set of queries.

    The PDF text is extracted and split into chunks. The chunks that best
    match the queries are sent to OpenAI with a summarization prompt.
    Progress information and the reply are printed; nothing is returned.

    Args:
        pdf_path: Path of the PDF file to process.
    """
    extracted_text = extract_text_from_pdf(pdf_path)
    print("Extracted text length:", len(extracted_text))

    retrieved_chunks = split_text(extracted_text)
    print("Number of chunks created:", len(retrieved_chunks))

    relevant_queries = ["Eiffel Tower", "Paris landmarks", "French history"]
    top_chunks = get_relevant_chunks(retrieved_chunks, relevant_queries)

    # With no selected chunks there is nothing to summarize, so do not send
    # an empty prompt to the model.
    if not top_chunks:
        print("\nNo relevant chunks found; skipping OpenAI request.")
        return

    # Join outside the f-string: a backslash inside an f-string expression
    # is a SyntaxError before Python 3.12.
    relevant_content = "\n\n".join(top_chunks)
    prompt = (
        f"You are a historical assistant. Summarize the following content "
        f"in context of the queries: {', '.join(relevant_queries)}.\n\n"
        f"Relevant content:\n{relevant_content}"
    )

    openai_response = query_openai(prompt)
    print("\nOpenAI Response:\n", openai_response)
|
|
| |
if __name__ == "__main__":
    # Run the PDF pipeline on one fixed file when executed as a script.
    target_pdf = r"C:\Users\shalini\Desktop\Project\abc.pdf"
    process_pdf(target_pdf)
|
|
|
|
| ======================================= |
def openai_llm_call(prompt: str, persona: str) -> str:
    """Send a prompt to an Azure OpenAI chat deployment and return the reply.

    Connection settings come from the environment variables ``API_VERSION``,
    ``ENDPOINT``, ``OPENAI_API_KEY`` and ``DEPLOYMENT``.

    Args:
        prompt: Content of the user message.
        persona: Content of the system message.

    Returns:
        The message content of the first choice, or an empty string when
        the response carries no content.
    """
    # Imported locally because nothing in the visible file imports ``os``
    # or ``AzureOpenAI``; without these the function raises NameError.
    import os

    from openai import AzureOpenAI

    client = AzureOpenAI(
        api_version=os.getenv("API_VERSION"),
        azure_endpoint=os.getenv("ENDPOINT"),
        api_key=os.getenv("OPENAI_API_KEY"),
    )
    response = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": persona,
            },
            {
                "role": "user",
                "content": prompt,
            },
        ],
        model=os.getenv("DEPLOYMENT"),
    )

    # The content can be None; honour the ``-> str`` annotation.
    return response.choices[0].message.content or ""
|
|