from sentence_transformers import SentenceTransformer, util

print("import done")

# Input chunks
retrieved_chunks = [
    "The Eiffel Tower is a landmark in Paris.",
    "Paris is the capital of France.",
    "The Louvre is also in Paris.",
    "Eiffel Tower was built in 1889.",
    "It is a famous tourist spot."
]
relevant_chunks = [
    "The Eiffel Tower is a landmark in Paris.",
    "Eiffel Tower was built in 1889."
]

# Load sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Compute embeddings
retrieved_embeddings = model.encode(retrieved_chunks, convert_to_tensor=True)
relevant_embeddings = model.encode(relevant_chunks, convert_to_tensor=True)

# Calculate pairwise cosine similarities
cosine_sim_matrix = util.cos_sim(retrieved_embeddings, relevant_embeddings)

# Print similarity matrix
print("Cosine Similarity Matrix (rows: retrieved, columns: relevant):\n")
for i, retrieved in enumerate(retrieved_chunks):
    for j, relevant in enumerate(relevant_chunks):
        score = cosine_sim_matrix[i][j].item()
        print(f"Similarity between:\n  Retrieved: \"{retrieved}\"\n  Relevant : \"{relevant}\"\n  Score    : {score:.4f}\n")

-----------------------------------------------

import numpy as np
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))

# Load models
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
perplexity_model = GPT2LMHeadModel.from_pretrained("gpt2")
perplexity_model.eval()

# Evaluation Metrics
def bleu_rouge_score(reference, generated):
    bleu = sentence_bleu([reference.split()], generated.split())
    rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rougeL = rouge.score(reference, generated)['rougeL'].fmeasure
    return {'bleu': bleu, 'rougeL': rougeL}

def cosine_sim(reference, generated):
    emb_ref = embedding_model.encode([reference])[0]
    emb_gen = embedding_model.encode([generated])[0]
    sim = cosine_similarity([emb_ref], [emb_gen])[0][0]
    return sim

def perplexity_score(text):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = perplexity_model(**inputs, labels=inputs["input_ids"])
        loss = outputs.loss
    return torch.exp(loss).item()

def precision_at_k(retrieved, relevant, k):
    top_k = retrieved[:k]
    correct = sum(1 for item in top_k if item in relevant)
    return correct / k

def recall_at_k(retrieved, relevant, k):
    correct = sum(1 for item in retrieved[:k] if item in relevant)
    return correct / len(relevant)

def ndcg_at_k(retrieved, relevant, k):
    def dcg(items):
        return sum([1 / np.log2(i + 2) if items[i] in relevant else 0 for i in range(len(items))])
    ideal = dcg(relevant[:k])
    actual = dcg(retrieved[:k])
    return actual / ideal if ideal != 0 else 0

def hit_at_k(retrieved, relevant, k):
    top_k = retrieved[:k]
    return int(any(item in relevant for item in top_k))

# Main Evaluation
def full_evaluation(reference, generated, retrieved, relevant_chunks):
    return {
        **bleu_rouge_score(reference, generated),
        "cosine_similarity": cosine_sim(reference, generated),
        "perplexity": perplexity_score(generated),
        "precision@5": precision_at_k(retrieved, relevant_chunks, 5),
        "recall@5": recall_at_k(retrieved, relevant_chunks, 5),
        "ndcg@5": ndcg_at_k(retrieved, relevant_chunks, 5),
        "hit@5": hit_at_k(retrieved, relevant_chunks, 5)
    }

# Sample Run
if __name__ == "__main__":
    reference_answer = "The Eiffel Tower is located in Paris."
    generated_response = "Eiffel Tower stands in Paris."
    retrieved_chunks = [
        "The Eiffel Tower is a landmark in Paris.",
        "Paris is the capital of France.",
        "The Louvre is also in Paris.",
        "Eiffel Tower was built in 1889.",
        "It is a famous tourist spot."
    ]
    relevant_chunks = [
        "The Eiffel Tower is a landmark in Paris.",
        "Eiffel Tower was built in 1889."
    ]

    scores = full_evaluation(reference_answer, generated_response, retrieved_chunks, relevant_chunks)
    for metric, score in scores.items():
        print(f"{metric}: {score:.4f}" if isinstance(score, float) else f"{metric}: {score}")

---------------------------

# Install dependencies first (if not already installed)
# pip install openai sentence-transformers langchain pymupdf

import fitz  # PyMuPDF for PDF text extraction
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util
import openai

# Configure OpenAI API Key
openai.api_key = "YOUR_OPENAI_API_KEY"  # Replace with your actual OpenAI API key

### **Step 1: Extract Text from PDF**
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text() for page in doc])
    return text

### **Step 2: Split Text Using RecursiveCharacterTextSplitter**
def split_text(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    return text_splitter.split_text(text)

### **Step 3: Compute Embeddings for Chunk Retrieval**
def get_relevant_chunks(retrieved_chunks, relevant_queries, model_name='all-MiniLM-L6-v2'):
    model = SentenceTransformer(model_name)
    retrieved_embeddings = model.encode(retrieved_chunks, convert_to_tensor=True)
    relevant_embeddings = model.encode(relevant_queries, convert_to_tensor=True)
    cosine_sim_matrix = util.cos_sim(retrieved_embeddings, relevant_embeddings)

    print("Cosine Similarity Matrix (rows: retrieved, columns: relevant queries):\n")
    relevant_scores = []
    for i, retrieved in enumerate(retrieved_chunks):
        for j, relevant in enumerate(relevant_queries):
            score = cosine_sim_matrix[i][j].item()
            relevant_scores.append((retrieved, relevant, score))
            print(f"Similarity between:\n  Retrieved: \"{retrieved}\"\n  Query    : \"{relevant}\"\n  Score    : {score:.4f}\n")

    # Sort and return the top-K most relevant chunks
    relevant_scores.sort(key=lambda x: x[2], reverse=True)
    return [x[0] for x in relevant_scores[:5]]  # Adjust top-K as needed

### **Step 4: Pass Top-K Chunks to OpenAI LLM**
def query_openai(prompt):
    # Note: ChatCompletion.create is the legacy interface of the openai package (pre-1.0)
    response = openai.ChatCompletion.create(
        model="gpt-4",  # You can change this to "gpt-3.5-turbo" or another model
        messages=[{"role": "system", "content": "You are an assistant."},
                  {"role": "user", "content": prompt}]
    )
    return response["choices"][0]["message"]["content"]

### **Final Workflow**
def process_pdf(pdf_path):
    # Step 1: Extract text from PDF
    extracted_text = extract_text_from_pdf(pdf_path)

    # Step 2: Split text into chunks
    retrieved_chunks = split_text(extracted_text)

    # Step 3: Define relevant queries (modify as per your needs)
    relevant_queries = ["Eiffel Tower", "Paris landmarks", "French history"]

    # Step 4: Retrieve top-K relevant chunks
    top_chunks = get_relevant_chunks(retrieved_chunks, relevant_queries)

    # Step 5: Query OpenAI LLM with relevant chunks
    prompt = f"Summarize this information: {' '.join(top_chunks)}"
    openai_response = query_openai(prompt)

    print("\nOpenAI Response:\n", openai_response)

# Run the pipeline with a sample PDF file
process_pdf("C:\\Users\\YourName\\Documents\\sample.pdf")  # Replace with your actual PDF file path
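
# ----------------------------------------------------------------------------
# Side note (sketch, not part of the original script): the pairwise scoring
# loop in get_relevant_chunks() can also be written with sentence_transformers'
# built-in util.semantic_search, which returns the top-k corpus entries per
# query, already sorted by cosine similarity. The chunk/query strings below
# are illustrative only.

from sentence_transformers import SentenceTransformer, util

demo_model = SentenceTransformer('all-MiniLM-L6-v2')
demo_chunks = [
    "The Eiffel Tower is a landmark in Paris.",
    "Paris is the capital of France.",
    "Eiffel Tower was built in 1889."
]
demo_queries = ["Eiffel Tower", "French history"]

demo_chunk_emb = demo_model.encode(demo_chunks, convert_to_tensor=True)
demo_query_emb = demo_model.encode(demo_queries, convert_to_tensor=True)

# One hit list per query; each hit is a dict with 'corpus_id' and 'score'
hits = util.semantic_search(demo_query_emb, demo_chunk_emb, top_k=2)
for query, query_hits in zip(demo_queries, hits):
    for hit in query_hits:
        print(f"{query} -> {demo_chunks[hit['corpus_id']]} ({hit['score']:.4f})")
# ----------------------------------------------------------------------------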
###############################################################################

# Same dependencies as the script above (openai.api_key is assumed to be
# configured as shown there).
import fitz  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util
import openai

### Step 1: Extract Text from PDF ###
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text() for page in doc])
    return text

### Step 2: Split Text Using RecursiveCharacterTextSplitter ###
def split_text(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    return text_splitter.split_text(text)

### Step 3: Compute Embeddings for Chunk Retrieval ###
def get_relevant_chunks(retrieved_chunks, relevant_queries, model_name='all-MiniLM-L6-v2',
                        top_k=5, similarity_threshold=0.4):
    model = SentenceTransformer(model_name)

    # Clean up empty strings
    retrieved_chunks = [chunk for chunk in retrieved_chunks if chunk.strip()]
    relevant_queries = [q for q in relevant_queries if q.strip()]

    # Debug check
    if not retrieved_chunks:
        raise ValueError("retrieved_chunks is empty!")
    if not relevant_queries:
        raise ValueError("relevant_queries is empty!")

    # Compute embeddings
    retrieved_embeddings = model.encode(retrieved_chunks, convert_to_tensor=True)
    relevant_embeddings = model.encode(relevant_queries, convert_to_tensor=True)

    # Cosine similarity matrix: (retrieved x queries)
    cosine_sim_matrix = util.cos_sim(retrieved_embeddings, relevant_embeddings)

    # Log similarity matrix
    print("Cosine Similarity Matrix (rows: chunks, columns: queries):\n", cosine_sim_matrix)

    # Score all pairs
    relevant_scores = []
    for i, retrieved in enumerate(retrieved_chunks):
        for j, relevant in enumerate(relevant_queries):
            score = cosine_sim_matrix[i][j].item()
            relevant_scores.append((retrieved, relevant, score))

    # Sort by score descending
    relevant_scores.sort(key=lambda x: x[2], reverse=True)

    # Log top matches
    print("\nTop Relevant Chunks and Scores:")
    for r, q, s in relevant_scores[:top_k]:
        print(f"\nChunk:\n{r[:150]}...\nQuery: {q}\nScore: {s:.4f}")

    # Apply threshold
    filtered = [x for x in relevant_scores if x[2] >= similarity_threshold]
    top_filtered = filtered[:top_k]
    return [x[0] for x in top_filtered]

### Step 4: Pass Top-K Chunks to OpenAI LLM ###
def query_openai(prompt):
    response = openai.ChatCompletion.create(
        model="gpt-4",  # Or "gpt-3.5-turbo"
        messages=[
            {"role": "system", "content": "You are an assistant."},
            {"role": "user", "content": prompt}
        ]
    )
    return response["choices"][0]["message"]["content"]

### Final Workflow ###
def process_pdf(pdf_path):
    # Step 1: Extract text
    extracted_text = extract_text_from_pdf(pdf_path)

    # Step 2: Split into chunks
    retrieved_chunks = split_text(extracted_text)

    # Step 3: Define queries
    relevant_queries = ["Eiffel Tower", "Paris landmarks", "French history"]

    # Step 4: Retrieve top-K relevant chunks
    top_chunks = get_relevant_chunks(retrieved_chunks, relevant_queries)

    # Step 5: Query OpenAI
    # Join chunks outside the f-string (backslashes are not allowed inside
    # f-string expressions on Python < 3.12)
    joined_chunks = "\n\n".join(top_chunks)
    prompt = (
        f"You are a historical assistant. Summarize the following content "
        f"in context of the queries: {', '.join(relevant_queries)}.\n\n"
        f"Relevant content:\n{joined_chunks}"
    )
    openai_response = query_openai(prompt)

    print("\nOpenAI Response:\n", openai_response)

### Run the pipeline on your sample PDF ###
if __name__ == "__main__":
    process_pdf(r"C:\Users\shalini\Desktop\Project\abc.pdf")

=======================================================================

import fitz
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util
import openai

# For Windows users: uncomment and set path if needed
# pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'

openai.api_key = "your-api-key-here"  # 🔐 Replace with your actual OpenAI API key

### Step 1: Extract Text from PDF or Fallback to OCR ###
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text() for page in doc])

    if text.strip():  # If text was extracted from the PDF
        print("Text extracted using fitz.")
        return text

    # OCR fallback for scanned/image-based PDFs
    print("No extractable text found. Trying OCR instead...")
    images = convert_from_path(pdf_path)
    ocr_text = "\n".join(pytesseract.image_to_string(image) for image in images)
    return ocr_text

### Step 2: Split Text into Chunks ###
def split_text(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=30)
    return text_splitter.split_text(text)

### Step 3: Get Top-K Relevant Chunks Based on Query ###
def get_relevant_chunks(retrieved_chunks, relevant_queries, model_name='all-MiniLM-L6-v2',
                        top_k=5, similarity_threshold=0.4):
    model = SentenceTransformer(model_name)

    retrieved_chunks = [chunk for chunk in retrieved_chunks if chunk.strip()]
    relevant_queries = [q for q in relevant_queries if q.strip()]

    if not retrieved_chunks:
        raise ValueError("retrieved_chunks is empty!")
    if not relevant_queries:
        raise ValueError("relevant_queries is empty!")

    retrieved_embeddings = model.encode(retrieved_chunks, convert_to_tensor=True)
    relevant_embeddings = model.encode(relevant_queries, convert_to_tensor=True)
    cosine_sim_matrix = util.cos_sim(retrieved_embeddings, relevant_embeddings)

    relevant_scores = []
    for i, retrieved in enumerate(retrieved_chunks):
        for j, relevant in enumerate(relevant_queries):
            score = cosine_sim_matrix[i][j].item()
            relevant_scores.append((retrieved, relevant, score))

    relevant_scores.sort(key=lambda x: x[2], reverse=True)

    filtered = [x for x in relevant_scores if x[2] >= similarity_threshold]
    top_filtered = filtered[:top_k]

    for r, q, s in top_filtered:
        print(f"\nChunk:\n{r[:150]}...\nQuery: {q}\nScore: {s:.4f}")

    return [x[0] for x in top_filtered]

### Step 4: Query OpenAI LLM with Prompt ###
def query_openai(prompt):
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are an assistant."},
            {"role": "user", "content": prompt}
        ]
    )
    return response["choices"][0]["message"]["content"]

### Final Workflow ###
def process_pdf(pdf_path):
    extracted_text = extract_text_from_pdf(pdf_path)
    print("Extracted text length:", len(extracted_text))

    retrieved_chunks = split_text(extracted_text)
    print("Number of chunks created:", len(retrieved_chunks))

    relevant_queries = ["Eiffel Tower", "Paris landmarks", "French history"]
    top_chunks = get_relevant_chunks(retrieved_chunks, relevant_queries)
    # Join chunks outside the f-string (backslashes are not allowed inside
    # f-string expressions on Python < 3.12)
    joined_chunks = "\n\n".join(top_chunks)
    prompt = (
        f"You are a historical assistant. Summarize the following content "
        f"in context of the queries: {', '.join(relevant_queries)}.\n\n"
        f"Relevant content:\n{joined_chunks}"
    )
    openai_response = query_openai(prompt)

    print("\nOpenAI Response:\n", openai_response)

### Run the pipeline ###
if __name__ == "__main__":
    process_pdf(r"C:\Users\shalini\Desktop\Project\abc.pdf")

=======================================

import os
from openai import AzureOpenAI

def openai_llm_call(prompt: str, persona: str) -> str:
    client = AzureOpenAI(
        api_version=os.getenv("API_VERSION"),
        azure_endpoint=os.getenv("ENDPOINT"),
        api_key=os.getenv("OPENAI_API_KEY"),
    )
    response = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": persona,
            },
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model=os.getenv("DEPLOYMENT")
    )
    return response.choices[0].message.content
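
# ----------------------------------------------------------------------------
# Example usage (sketch): assumes the API_VERSION, ENDPOINT, OPENAI_API_KEY and
# DEPLOYMENT environment variables are set (for example via a .env file loaded
# with python-dotenv) and that DEPLOYMENT names an existing Azure OpenAI
# deployment. The prompt/persona strings are illustrative only.
if __name__ == "__main__":
    answer = openai_llm_call(
        prompt="Summarize the history of the Eiffel Tower in two sentences.",
        persona="You are a helpful historical assistant.",
    )
    print(answer)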