# Score each retrieved chunk against the known-relevant chunks with
# sentence embeddings and cosine similarity.
from sentence_transformers import SentenceTransformer, util

# Chunks returned by the retriever for a query about the Eiffel Tower.
retrieved_chunks = [
    "The Eiffel Tower is a landmark in Paris.",
    "Paris is the capital of France.",
    "The Louvre is also in Paris.",
    "Eiffel Tower was built in 1889.",
    "It is a famous tourist spot."
]

# Ground-truth chunks the retriever should have surfaced.
relevant_chunks = [
    "The Eiffel Tower is a landmark in Paris.",
    "Eiffel Tower was built in 1889."
]

model = SentenceTransformer('all-MiniLM-L6-v2')

retrieved_embeddings = model.encode(retrieved_chunks, convert_to_tensor=True)
relevant_embeddings = model.encode(relevant_chunks, convert_to_tensor=True)

# Pairwise similarities: rows are retrieved chunks, columns are relevant chunks.
cosine_sim_matrix = util.cos_sim(retrieved_embeddings, relevant_embeddings)

print("Cosine Similarity Matrix (rows: retrieved, columns: relevant):\n")
for i, retrieved in enumerate(retrieved_chunks):
    for j, relevant in enumerate(relevant_chunks):
        score = cosine_sim_matrix[i][j].item()
        print(f"Similarity between:\n Retrieved: \"{retrieved}\"\n Relevant : \"{relevant}\"\n Score : {score:.4f}\n")
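If a single score per retrieved chunk is more useful than the full matrix, the tensor can be reduced along the relevant axis; a minimal sketch using the variables defined above:

# Sketch: one score per retrieved chunk, taken as its best match across the
# relevant set (util.cos_sim returns a torch tensor, so max(dim=1) applies).
best_scores, best_idx = cosine_sim_matrix.max(dim=1)
for chunk, score, idx in zip(retrieved_chunks, best_scores, best_idx):
    print(f"{score.item():.4f}  {chunk!r} -> {relevant_chunks[int(idx)]!r}")

-----------------------------------------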
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# Models used by the generation-quality metrics.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
perplexity_model = GPT2LMHeadModel.from_pretrained("gpt2")
perplexity_model.eval()


def bleu_rouge_score(reference, generated):
    # Smoothing keeps BLEU meaningful on short sentences, where higher-order
    # n-gram counts are often zero.
    smoothing = SmoothingFunction().method1
    bleu = sentence_bleu([reference.split()], generated.split(), smoothing_function=smoothing)
    rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rougeL = rouge.score(reference, generated)['rougeL'].fmeasure
    return {'bleu': bleu, 'rougeL': rougeL}


def cosine_sim(reference, generated):
    emb_ref = embedding_model.encode([reference])[0]
    emb_gen = embedding_model.encode([generated])[0]
    sim = cosine_similarity([emb_ref], [emb_gen])[0][0]
    return sim


def perplexity_score(text):
    # Perplexity under GPT-2: exp of the average next-token cross-entropy.
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = perplexity_model(**inputs, labels=inputs["input_ids"])
    loss = outputs.loss
    return torch.exp(loss).item()


def precision_at_k(retrieved, relevant, k):
    top_k = retrieved[:k]
    correct = sum(1 for item in top_k if item in relevant)
    return correct / k


def recall_at_k(retrieved, relevant, k):
    correct = sum(1 for item in retrieved[:k] if item in relevant)
    return correct / len(relevant)


def ndcg_at_k(retrieved, relevant, k):
    # Binary relevance: gain 1 at 0-based rank i, discounted by log2(i + 2).
    def dcg(items):
        return sum(1 / np.log2(i + 2) if items[i] in relevant else 0 for i in range(len(items)))
    ideal = dcg(relevant[:k])
    actual = dcg(retrieved[:k])
    return actual / ideal if ideal != 0 else 0


def hit_at_k(retrieved, relevant, k):
    top_k = retrieved[:k]
    return int(any(item in relevant for item in top_k))


def full_evaluation(reference, generated, retrieved, relevant_chunks):
    return {
        **bleu_rouge_score(reference, generated),
        "cosine_similarity": cosine_sim(reference, generated),
        "perplexity": perplexity_score(generated),
        "precision@5": precision_at_k(retrieved, relevant_chunks, 5),
        "recall@5": recall_at_k(retrieved, relevant_chunks, 5),
        "ndcg@5": ndcg_at_k(retrieved, relevant_chunks, 5),
        "hit@5": hit_at_k(retrieved, relevant_chunks, 5)
    }


if __name__ == "__main__":
    reference_answer = "The Eiffel Tower is located in Paris."
    generated_response = "Eiffel Tower stands in Paris."

    retrieved_chunks = [
        "The Eiffel Tower is a landmark in Paris.",
        "Paris is the capital of France.",
        "The Louvre is also in Paris.",
        "Eiffel Tower was built in 1889.",
        "It is a famous tourist spot."
    ]

    relevant_chunks = [
        "The Eiffel Tower is a landmark in Paris.",
        "Eiffel Tower was built in 1889."
    ]

    scores = full_evaluation(reference_answer, generated_response, retrieved_chunks, relevant_chunks)

    for metric, score in scores.items():
        print(f"{metric}: {score:.4f}" if isinstance(score, float) else f"{metric}: {score}")
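With the sample data above, the retrieval metrics can be checked by hand: two of the five retrieved chunks are relevant, so precision@5 = 2/5 = 0.4; both relevant chunks appear in the top 5, so recall@5 = 1.0 and hit@5 = 1. The relevant chunks sit at ranks 1 and 4, giving nDCG@5 = (1/log2(2) + 1/log2(5)) / (1/log2(2) + 1/log2(3)) ≈ 1.4307 / 1.6309 ≈ 0.877.

---------------------------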
import fitz  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util
import openai

openai.api_key = "YOUR_OPENAI_API_KEY"


def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = "\n".join(page.get_text() for page in doc)
    return text


def split_text(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    return text_splitter.split_text(text)


def get_relevant_chunks(retrieved_chunks, relevant_queries, model_name='all-MiniLM-L6-v2', top_k=5, similarity_threshold=0.4):
    model = SentenceTransformer(model_name)

    # Drop empty strings before encoding.
    retrieved_chunks = [chunk for chunk in retrieved_chunks if chunk.strip()]
    relevant_queries = [q for q in relevant_queries if q.strip()]

    if not retrieved_chunks:
        raise ValueError("retrieved_chunks is empty!")
    if not relevant_queries:
        raise ValueError("relevant_queries is empty!")

    retrieved_embeddings = model.encode(retrieved_chunks, convert_to_tensor=True)
    relevant_embeddings = model.encode(relevant_queries, convert_to_tensor=True)

    cosine_sim_matrix = util.cos_sim(retrieved_embeddings, relevant_embeddings)
    print("Cosine Similarity Matrix (rows: chunks, columns: queries):\n", cosine_sim_matrix)

    # Flatten to (chunk, query, score) triples and rank by score.
    relevant_scores = []
    for i, retrieved in enumerate(retrieved_chunks):
        for j, relevant in enumerate(relevant_queries):
            score = cosine_sim_matrix[i][j].item()
            relevant_scores.append((retrieved, relevant, score))

    relevant_scores.sort(key=lambda x: x[2], reverse=True)

    print("\nTop Relevant Chunks and Scores:")
    for r, q, s in relevant_scores[:top_k]:
        print(f"\nChunk:\n{r[:150]}...\nQuery: {q}\nScore: {s:.4f}")

    # Keep only matches above the threshold, then take the top_k.
    filtered = [x for x in relevant_scores if x[2] >= similarity_threshold]
    top_filtered = filtered[:top_k]

    return [x[0] for x in top_filtered]


def query_openai(prompt):
    # Legacy openai<1.0 interface; newer clients expose
    # client.chat.completions.create instead.
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are an assistant."},
            {"role": "user", "content": prompt}
        ]
    )
    return response["choices"][0]["message"]["content"]


def process_pdf(pdf_path):
    extracted_text = extract_text_from_pdf(pdf_path)
    retrieved_chunks = split_text(extracted_text)

    relevant_queries = ["Eiffel Tower", "Paris landmarks", "French history"]
    top_chunks = get_relevant_chunks(retrieved_chunks, relevant_queries)

    # Join outside the f-string: backslashes inside f-string expressions are
    # a syntax error before Python 3.12.
    joined_chunks = "\n\n".join(top_chunks)
    prompt = (
        f"You are a historical assistant. Summarize the following content "
        f"in context of the queries: {', '.join(relevant_queries)}.\n\n"
        f"Relevant content:\n{joined_chunks}"
    )
    openai_response = query_openai(prompt)

    print("\nOpenAI Response:\n", openai_response)


if __name__ == "__main__":
    process_pdf(r"C:\Users\shalini\Desktop\Project\abc.pdf")
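The retrieval step can be smoke-tested without a PDF or an OpenAI call by feeding split_text in-memory text; a minimal sketch reusing the functions above (the sample sentence is illustrative only):

# Sketch: exercise chunking and retrieval on in-memory text.
sample = ("The Eiffel Tower was completed in 1889 as the entrance to the "
          "World's Fair in Paris. ") * 20
chunks = split_text(sample)
print(get_relevant_chunks(chunks, ["Eiffel Tower"], top_k=3))

=======================================================================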
import fitz  # PyMuPDF
import pytesseract
from pdf2image import convert_from_path

from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util
import openai

openai.api_key = "your-api-key-here"


def extract_text_from_pdf(pdf_path):
    # Try the embedded text layer first.
    doc = fitz.open(pdf_path)
    text = "\n".join(page.get_text() for page in doc)

    if text.strip():
        print("Text extracted using fitz.")
        return text

    # Scanned PDF: rasterize each page and run OCR on the images.
    print("No extractable text found. Trying OCR instead...")
    images = convert_from_path(pdf_path)
    ocr_text = "\n".join(pytesseract.image_to_string(image) for image in images)
    return ocr_text


def split_text(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=30)
    return text_splitter.split_text(text)


def get_relevant_chunks(retrieved_chunks, relevant_queries, model_name='all-MiniLM-L6-v2', top_k=5, similarity_threshold=0.4):
    model = SentenceTransformer(model_name)
    retrieved_chunks = [chunk for chunk in retrieved_chunks if chunk.strip()]
    relevant_queries = [q for q in relevant_queries if q.strip()]

    if not retrieved_chunks:
        raise ValueError("retrieved_chunks is empty!")
    if not relevant_queries:
        raise ValueError("relevant_queries is empty!")

    retrieved_embeddings = model.encode(retrieved_chunks, convert_to_tensor=True)
    relevant_embeddings = model.encode(relevant_queries, convert_to_tensor=True)
    cosine_sim_matrix = util.cos_sim(retrieved_embeddings, relevant_embeddings)

    relevant_scores = []
    for i, retrieved in enumerate(retrieved_chunks):
        for j, relevant in enumerate(relevant_queries):
            score = cosine_sim_matrix[i][j].item()
            relevant_scores.append((retrieved, relevant, score))

    relevant_scores.sort(key=lambda x: x[2], reverse=True)
    filtered = [x for x in relevant_scores if x[2] >= similarity_threshold]
    top_filtered = filtered[:top_k]

    for r, q, s in top_filtered:
        print(f"\nChunk:\n{r[:150]}...\nQuery: {q}\nScore: {s:.4f}")

    return [x[0] for x in top_filtered]


def query_openai(prompt):
    # Legacy openai<1.0 interface.
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are an assistant."},
            {"role": "user", "content": prompt}
        ]
    )
    return response["choices"][0]["message"]["content"]


def process_pdf(pdf_path):
    extracted_text = extract_text_from_pdf(pdf_path)
    print("Extracted text length:", len(extracted_text))

    retrieved_chunks = split_text(extracted_text)
    print("Number of chunks created:", len(retrieved_chunks))

    relevant_queries = ["Eiffel Tower", "Paris landmarks", "French history"]

    top_chunks = get_relevant_chunks(retrieved_chunks, relevant_queries)

    # Join outside the f-string (backslashes in f-string expressions are a
    # syntax error before Python 3.12).
    joined_chunks = "\n\n".join(top_chunks)
    prompt = (
        f"You are a historical assistant. Summarize the following content "
        f"in context of the queries: {', '.join(relevant_queries)}.\n\n"
        f"Relevant content:\n{joined_chunks}"
    )

    openai_response = query_openai(prompt)
    print("\nOpenAI Response:\n", openai_response)


if __name__ == "__main__":
    process_pdf(r"C:\Users\shalini\Desktop\Project\abc.pdf")
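The OCR fallback relies on two external dependencies: pdf2image needs the poppler utilities, and pytesseract needs the tesseract binary on the PATH. A minimal fail-fast check, as a sketch (the error message text is illustrative):

import shutil
import pytesseract

# Sketch: verify the tesseract binary is reachable before processing PDFs.
if shutil.which("tesseract") is None:
    raise RuntimeError(
        "tesseract not found on PATH; install it or point "
        "pytesseract.pytesseract.tesseract_cmd at the executable"
    )
print("tesseract version:", pytesseract.get_tesseract_version())

=======================================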
import os

from openai import AzureOpenAI


def openai_llm_call(prompt: str, persona: str) -> str:
    # Azure-hosted chat completion; the endpoint, key, API version, and
    # deployment name all come from environment variables.
    client = AzureOpenAI(
        api_version=os.getenv("API_VERSION"),
        azure_endpoint=os.getenv("ENDPOINT"),
        api_key=os.getenv("OPENAI_API_KEY"),
    )
    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": persona},
            {"role": "user", "content": prompt},
        ],
        model=os.getenv("DEPLOYMENT"),
    )

    return response.choices[0].message.content
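A minimal usage sketch, assuming API_VERSION, ENDPOINT, OPENAI_API_KEY, and DEPLOYMENT are set in the environment (the prompt and persona strings are illustrative):

# Sketch: call the helper above with an illustrative persona and prompt.
answer = openai_llm_call(
    "Summarize the history of the Eiffel Tower in two sentences.",
    persona="You are a concise historical assistant.",
)
print(answer)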