from sentence_transformers import SentenceTransformer, util
print("import done")
# Input chunks
retrieved_chunks = [
    "The Eiffel Tower is a landmark in Paris.",
    "Paris is the capital of France.",
    "The Louvre is also in Paris.",
    "Eiffel Tower was built in 1889.",
    "It is a famous tourist spot."
]

relevant_chunks = [
    "The Eiffel Tower is a landmark in Paris.",
    "Eiffel Tower was built in 1889."
]

# Load sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Compute embeddings
retrieved_embeddings = model.encode(retrieved_chunks, convert_to_tensor=True)
relevant_embeddings = model.encode(relevant_chunks, convert_to_tensor=True)

# Calculate pairwise cosine similarities
cosine_sim_matrix = util.cos_sim(retrieved_embeddings, relevant_embeddings)

# Print similarity matrix
print("Cosine Similarity Matrix (rows: retrieved, columns: relevant):\n")
for i, retrieved in enumerate(retrieved_chunks):
    for j, relevant in enumerate(relevant_chunks):
        score = cosine_sim_matrix[i][j].item()
        print(f"Similarity between:\n  Retrieved: \"{retrieved}\"\n  Relevant : \"{relevant}\"\n  Score    : {score:.4f}\n")------
        -----------------------------------------


import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch


# Load models
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
perplexity_model = GPT2LMHeadModel.from_pretrained("gpt2")
perplexity_model.eval()

# Evaluation Metrics
def bleu_rouge_score(reference, generated):
    # Smoothing avoids zero BLEU when short texts share no higher-order n-grams
    smoothing = SmoothingFunction().method1
    bleu = sentence_bleu([reference.split()], generated.split(), smoothing_function=smoothing)
    rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rougeL = rouge.score(reference, generated)['rougeL'].fmeasure
    return {'bleu': bleu, 'rougeL': rougeL}

def cosine_sim(reference, generated):
    emb_ref = embedding_model.encode([reference])[0]
    emb_gen = embedding_model.encode([generated])[0]
    sim = cosine_similarity([emb_ref], [emb_gen])[0][0]
    return sim

def perplexity_score(text):
    # Perplexity = exp(mean token cross-entropy) under GPT-2; lower is better
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = perplexity_model(**inputs, labels=inputs["input_ids"])
        loss = outputs.loss
    return torch.exp(loss).item()

def precision_at_k(retrieved, relevant, k):
    top_k = retrieved[:k]
    correct = sum(1 for item in top_k if item in relevant)
    return correct / k

def recall_at_k(retrieved, relevant, k):
    correct = sum(1 for item in retrieved[:k] if item in relevant)
    return correct / len(relevant) if relevant else 0.0

def ndcg_at_k(retrieved, relevant, k):
    # Binary relevance: the item at 0-based position i contributes 1/log2(i+2)
    def dcg(items):
        return sum(1 / np.log2(i + 2) if items[i] in relevant else 0 for i in range(len(items)))
    ideal = dcg(relevant[:k])
    actual = dcg(retrieved[:k])
    return actual / ideal if ideal != 0 else 0
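
# Worked example with the sample lists used below (the relevant items appear at
# ranks 1 and 4 of the retrieved list, k=5):
#   DCG  = 1/log2(2) + 1/log2(5) ≈ 1.0000 + 0.4307 = 1.4307
#   IDCG = 1/log2(2) + 1/log2(3) ≈ 1.0000 + 0.6309 = 1.6309
#   nDCG@5 ≈ 1.4307 / 1.6309 ≈ 0.877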

def hit_at_k(retrieved, relevant, k):
    top_k = retrieved[:k]
    return int(any(item in relevant for item in top_k))

# Main Evaluation
def full_evaluation(reference, generated, retrieved, relevant_chunks):
    return {
        **bleu_rouge_score(reference, generated),
        "cosine_similarity": cosine_sim(reference, generated),
        "perplexity": perplexity_score(generated),
        "precision@5": precision_at_k(retrieved, relevant_chunks, 5),
        "recall@5": recall_at_k(retrieved, relevant_chunks, 5),
        "ndcg@5": ndcg_at_k(retrieved, relevant_chunks, 5),
        "hit@5": hit_at_k(retrieved, relevant_chunks, 5)
    }
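
# A minimal extension sketch (assumption: `examples` is an illustrative list of
# (reference, generated) pairs, with the retrieval lists shared across examples);
# averaging the per-example metrics gives one score per metric for a whole set.
def average_scores(examples, retrieved, relevant):
    totals = {}
    for reference, generated in examples:
        for metric, value in full_evaluation(reference, generated, retrieved, relevant).items():
            totals[metric] = totals.get(metric, 0.0) + float(value)
    return {metric: total / len(examples) for metric, total in totals.items()}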

# Sample Run
if __name__ == "__main__":
    reference_answer = "The Eiffel Tower is located in Paris."
    generated_response = "Eiffel Tower stands in Paris."

    retrieved_chunks = [
        "The Eiffel Tower is a landmark in Paris.",
        "Paris is the capital of France.",
        "The Louvre is also in Paris.",
        "Eiffel Tower was built in 1889.",
        "It is a famous tourist spot."
    ]

    relevant_chunks = [
        "The Eiffel Tower is a landmark in Paris.",
        "Eiffel Tower was built in 1889."
    ]

    scores = full_evaluation(reference_answer, generated_response, retrieved_chunks, relevant_chunks)

    for metric, score in scores.items():
        print(f"{metric}: {score:.4f}" if isinstance(score, float) else f"{metric}: {score}")

---------------------------
# Install dependencies first (if not already installed)
# pip install openai sentence-transformers langchain pymupdf

import fitz  # PyMuPDF for PDF text extraction
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util
import openai

# Configure OpenAI API Key
openai.api_key = "YOUR_OPENAI_API_KEY"  # Replace with your actual OpenAI API key

### Step 1: Extract Text from PDF ###
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text() for page in doc])
    return text

### Step 2: Split Text Using RecursiveCharacterTextSplitter ###
def split_text(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    return text_splitter.split_text(text)

### Step 3: Compute Embeddings for Chunk Retrieval ###
def get_relevant_chunks(retrieved_chunks, relevant_queries, model_name='all-MiniLM-L6-v2'):
    model = SentenceTransformer(model_name)

    retrieved_embeddings = model.encode(retrieved_chunks, convert_to_tensor=True)
    relevant_embeddings = model.encode(relevant_queries, convert_to_tensor=True)

    cosine_sim_matrix = util.cos_sim(retrieved_embeddings, relevant_embeddings)

    print("Cosine Similarity Matrix (rows: retrieved, columns: relevant queries):\n")
    relevant_scores = []
    
    for i, retrieved in enumerate(retrieved_chunks):
        for j, relevant in enumerate(relevant_queries):
            score = cosine_sim_matrix[i][j].item()
            relevant_scores.append((retrieved, relevant, score))
            print(f"Similarity between:\n  Retrieved: \"{retrieved}\"\n  Query    : \"{relevant}\"\n  Score    : {score:.4f}\n")

    # Sort by score and return the top-K chunks, skipping duplicates (the same
    # chunk can score against several queries)
    relevant_scores.sort(key=lambda x: x[2], reverse=True)
    return list(dict.fromkeys(x[0] for x in relevant_scores))[:5]  # Adjust top-K as needed

### Step 4: Pass Top-K Chunks to OpenAI LLM ###
def query_openai(prompt):
    # Note: openai.ChatCompletion is the legacy interface from openai<1.0;
    # openai>=1.0 uses client.chat.completions.create instead
    response = openai.ChatCompletion.create(
        model="gpt-4",  # You can change this to "gpt-3.5-turbo" or another model
        messages=[{"role": "system", "content": "You are an assistant."},
                  {"role": "user", "content": prompt}]
    )
    return response["choices"][0]["message"]["content"]

### Final Workflow ###
def process_pdf(pdf_path):
    # Step 1: Extract text from PDF
    extracted_text = extract_text_from_pdf(pdf_path)

    # Step 2: Split text into chunks
    retrieved_chunks = split_text(extracted_text)

    # Step 3: Define relevant queries (modify as per your needs)
    relevant_queries = ["Eiffel Tower", "Paris landmarks", "French history"]
    
    # Step 4: Retrieve top-K relevant chunks
    top_chunks = get_relevant_chunks(retrieved_chunks, relevant_queries)

    # Step 5: Query OpenAI LLM with relevant chunks
    prompt = f"Summarize this information: {' '.join(top_chunks)}"
    openai_response = query_openai(prompt)

    print("\nOpenAI Response:\n", openai_response)

# Run the pipeline with a sample PDF file
process_pdf("C:\\Users\\YourName\\Documents\\sample.pdf")  # Replace with your actual PDF file path

###############################################################################
# Revised pipeline below (reuses the imports and OpenAI key setup from above)
### Step 1: Extract Text from PDF ###
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text() for page in doc])
    return text
 
### Step 2: Split Text Using RecursiveCharacterTextSplitter ###
def split_text(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    return text_splitter.split_text(text)
 
### Step 3: Compute Embeddings for Chunk Retrieval ###
def get_relevant_chunks(retrieved_chunks, relevant_queries, model_name='all-MiniLM-L6-v2', top_k=5, similarity_threshold=0.4):
    model = SentenceTransformer(model_name)
 
    # Clean up empty strings
    retrieved_chunks = [chunk for chunk in retrieved_chunks if chunk.strip()]
    relevant_queries = [q for q in relevant_queries if q.strip()]
 
    # Debug check
    if not retrieved_chunks:
        raise ValueError("retrieved_chunks is empty!")
    if not relevant_queries:
        raise ValueError("relevant_queries is empty!")
 
    # Compute embeddings
    retrieved_embeddings = model.encode(retrieved_chunks, convert_to_tensor=True)
    relevant_embeddings = model.encode(relevant_queries, convert_to_tensor=True)
 
    # Cosine similarity matrix: (retrieved x queries)
    cosine_sim_matrix = util.cos_sim(retrieved_embeddings, relevant_embeddings)
 
    # Log similarity matrix
    print("Cosine Similarity Matrix (rows: chunks, columns: queries):\n", cosine_sim_matrix)
 
    # Score all pairs
    relevant_scores = []
    for i, retrieved in enumerate(retrieved_chunks):
        for j, relevant in enumerate(relevant_queries):
            score = cosine_sim_matrix[i][j].item()
            relevant_scores.append((retrieved, relevant, score))
 
    # Sort by score descending
    relevant_scores.sort(key=lambda x: x[2], reverse=True)
 
    # Log top matches
    print("\nTop Relevant Chunks and Scores:")
    for r, q, s in relevant_scores[:top_k]:
        print(f"\nChunk:\n{r[:150]}...\nQuery: {q}\nScore: {s:.4f}")
 
    # Apply threshold, then keep the top-K distinct chunks (the same chunk can
    # score against several queries, so dedupe before truncating)
    filtered = [x[0] for x in relevant_scores if x[2] >= similarity_threshold]
    return list(dict.fromkeys(filtered))[:top_k]
 
### Step 4: Pass Top-K Chunks to OpenAI LLM ###
def query_openai(prompt):
    response = openai.ChatCompletion.create(
        model="gpt-4",  # Or "gpt-3.5-turbo"
        messages=[
            {"role": "system", "content": "You are an assistant."},
            {"role": "user", "content": prompt}
        ]
    )
    return response["choices"][0]["message"]["content"]
 
### Final Workflow ###
def process_pdf(pdf_path):
    # Step 1: Extract text
    extracted_text = extract_text_from_pdf(pdf_path)
 
    # Step 2: Split into chunks
    retrieved_chunks = split_text(extracted_text)
 
    # Step 3: Define queries
    relevant_queries = ["Eiffel Tower", "Paris landmarks", "French history"]
 
    # Step 4: Retrieve top-K relevant chunks
    top_chunks = get_relevant_chunks(retrieved_chunks, relevant_queries)
 
    # Step 5: Query OpenAI
    # Join outside the f-string: a backslash inside an f-string expression is a
    # SyntaxError before Python 3.12
    joined_chunks = "\n\n".join(top_chunks)
    prompt = (
        f"You are a historical assistant. Summarize the following content "
        f"in context of the queries: {', '.join(relevant_queries)}.\n\n"
        f"Relevant content:\n{joined_chunks}"
    )
    openai_response = query_openai(prompt)
 
    print("\nOpenAI Response:\n", openai_response)
 
### Run the pipeline on your sample PDF ###
if __name__ == "__main__":
    process_pdf(r"C:\Users\shalini\Desktop\Project\abc.pdf")



=======================================================================

import fitz
import pytesseract
from pdf2image import convert_from_path  # requires the poppler utilities to be installed
from PIL import Image

from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util
import openai

# For Windows users: uncomment and set path if needed
# pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'

openai.api_key = "your-api-key-here"  # 🔐 Replace with your actual OpenAI API key

### Step 1: Extract Text from PDF or Fallback to OCR ###
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text() for page in doc])

    if text.strip():  # If text was extracted from PDF
        print("Text extracted using fitz.")
        return text

    # OCR fallback for scanned/image-based PDFs
    print("No extractable text found. Trying OCR instead...")
    images = convert_from_path(pdf_path)
    ocr_text = "\n".join(pytesseract.image_to_string(image) for image in images)
    return ocr_text

### Step 2: Split Text into Chunks ###
def split_text(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=30)
    return text_splitter.split_text(text)

### Step 3: Get Top-K Relevant Chunks Based on Query ###
def get_relevant_chunks(retrieved_chunks, relevant_queries, model_name='all-MiniLM-L6-v2', top_k=5, similarity_threshold=0.4):
    model = SentenceTransformer(model_name)
    retrieved_chunks = [chunk for chunk in retrieved_chunks if chunk.strip()]
    relevant_queries = [q for q in relevant_queries if q.strip()]

    if not retrieved_chunks:
        raise ValueError("retrieved_chunks is empty!")
    if not relevant_queries:
        raise ValueError("relevant_queries is empty!")

    retrieved_embeddings = model.encode(retrieved_chunks, convert_to_tensor=True)
    relevant_embeddings = model.encode(relevant_queries, convert_to_tensor=True)
    cosine_sim_matrix = util.cos_sim(retrieved_embeddings, relevant_embeddings)

    relevant_scores = []
    for i, retrieved in enumerate(retrieved_chunks):
        for j, relevant in enumerate(relevant_queries):
            score = cosine_sim_matrix[i][j].item()
            relevant_scores.append((retrieved, relevant, score))

    relevant_scores.sort(key=lambda x: x[2], reverse=True)
    filtered = [x for x in relevant_scores if x[2] >= similarity_threshold]

    # Keep the best-scoring pair per distinct chunk (the same chunk can match
    # several queries)
    seen = set()
    top_filtered = []
    for chunk, query, score in filtered:
        if chunk not in seen:
            seen.add(chunk)
            top_filtered.append((chunk, query, score))
        if len(top_filtered) == top_k:
            break

    for r, q, s in top_filtered:
        print(f"\nChunk:\n{r[:150]}...\nQuery: {q}\nScore: {s:.4f}")

    return [x[0] for x in top_filtered]

### Step 4: Query OpenAI LLM with Prompt ###
def query_openai(prompt):
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are an assistant."},
            {"role": "user", "content": prompt}
        ]
    )
    return response["choices"][0]["message"]["content"]

### Final Workflow ###
def process_pdf(pdf_path):
    extracted_text = extract_text_from_pdf(pdf_path)
    print("Extracted text length:", len(extracted_text))

    retrieved_chunks = split_text(extracted_text)
    print("Number of chunks created:", len(retrieved_chunks))

    relevant_queries = ["Eiffel Tower", "Paris landmarks", "French history"]

    top_chunks = get_relevant_chunks(retrieved_chunks, relevant_queries)

    # Join outside the f-string: a backslash inside an f-string expression is a
    # SyntaxError before Python 3.12
    joined_chunks = "\n\n".join(top_chunks)
    prompt = (
        f"You are a historical assistant. Summarize the following content "
        f"in context of the queries: {', '.join(relevant_queries)}.\n\n"
        f"Relevant content:\n{joined_chunks}"
    )

    openai_response = query_openai(prompt)
    print("\nOpenAI Response:\n", openai_response)

### Run the pipeline ###
if __name__ == "__main__":
    process_pdf(r"C:\Users\shalini\Desktop\Project\abc.pdf")


=======================================
import os

from openai import AzureOpenAI


def openai_llm_call(prompt: str, persona: str) -> str:
 
    client = AzureOpenAI(
        api_version=os.getenv("API_VERSION"),
        azure_endpoint=os.getenv("ENDPOINT"),
        api_key=os.getenv("OPENAI_API_KEY"),
    )
    response = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": persona,
            },
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model=os.getenv("DEPLOYMENT")
    )
 
    return response.choices[0].message.content
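
# A hedged usage sketch: assumes the API_VERSION, ENDPOINT, OPENAI_API_KEY, and
# DEPLOYMENT environment variables point at a valid Azure OpenAI deployment.
if __name__ == "__main__":
    reply = openai_llm_call(
        prompt="Summarize the history of the Eiffel Tower in two sentences.",
        persona="You are a concise historical assistant.",
    )
    print(reply)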