adityagofi committed on
Commit
82e5ffa
·
verified ·
1 Parent(s): 8abbca0

Update title and Top k related query

Browse files
Files changed (1) hide show
  1. app.py +78 -79
app.py CHANGED
@@ -1,80 +1,79 @@
1
- import PyPDF2
2
- import numpy as np
3
- import faiss
4
- from transformers import BertTokenizer, BertForMaskedLM, BertForQuestionAnswering
5
- import torch
6
-
7
- tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
8
- qa_model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
9
- from transformers import AutoTokenizer, AutoModelForCausalLM
10
- gen_model_id = "distilgpt2"
11
- gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_id)
12
- gen_model = AutoModelForCausalLM.from_pretrained(gen_model_id).to("cuda" if torch.cuda.is_available() else "cpu")
13
-
14
- def read_pdf(file):
15
- reader = PyPDF2.PdfReader(file)
16
- text = ""
17
- for page in reader.pages:
18
- text += page.extract_text()
19
- return text
20
-
21
- def split_text(text, chunk_size=500):
22
- return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
23
-
24
- def encode_text(text):
25
- inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
26
- with torch.no_grad():
27
- outputs = qa_model.bert(**inputs)
28
- return outputs.last_hidden_state.mean(dim=1).numpy().astype(np.float32)
29
-
30
- def create_faiss_index(chunks):
31
- embeddings = np.vstack([encode_text(chunk) for chunk in chunks])
32
- index = faiss.IndexFlatL2(embeddings.shape[1])
33
- index.add(embeddings)
34
- return index, embeddings
35
-
36
-
37
- def search_faq(query, index, k=3):
38
- query_emb = encode_text(query)
39
- D, I = index.search(query_emb, k)
40
- return I
41
-
42
- def generate_distilgpt2_answer(context, question):
43
- prompt = f"Context: {context}\nQuestion: {question}\nAnswer:"
44
- inputs = gen_tokenizer(prompt, return_tensors="pt").to(gen_model.device)
45
- with torch.no_grad():
46
- outputs = gen_model.generate(**inputs, max_new_tokens=64,
47
- pad_token_id=gen_tokenizer.eos_token_id,
48
- eos_token_id=gen_tokenizer.eos_token_id,
49
- repetition_penalty=1.3)
50
- generated = gen_tokenizer.decode(outputs[0], skip_special_tokens=True)
51
- return generated.split("Answer:")[-1].strip()
52
-
53
- def process_pdf_and_answer(pdf_file, query, top_k=1):
54
- text = read_pdf(pdf_file)
55
- chunks = split_text(text)
56
- faiss_index, _ = create_faiss_index(chunks)
57
- indices = search_faq(query, faiss_index, k=top_k)
58
-
59
- answers = []
60
- for idx in indices[0]:
61
- context = chunks[idx]
62
- answer = generate_distilgpt2_answer(context, query)
63
- answers.append(answer)
64
-
65
- return "\n\n---\n\n".join(answers)
66
-
67
- import gradio as gr
68
- interface = gr.Interface(
69
- fn=process_pdf_and_answer,
70
- inputs=[
71
- gr.File(label="Upload PDF"),
72
- gr.Textbox(label="Your Question"),
73
- gr.Slider(1, 5, step=1, label="Top K Results", value=1)
74
- ],
75
- outputs=gr.Textbox(label="Generated Answer(s)"),
76
- title="📄 PDF Quwstion Answering",
77
- description="Upload a PDF and ask a question about its content. The model will try to answer based on the most relevant chunks.",
78
- )
79
-
80
  interface.launch()
 
1
# Model setup: a SQuAD-finetuned BERT supplies sentence embeddings for
# retrieval, and distilgpt2 generates free-form answers from retrieved context.
# All imports are consolidated here (the original interleaved a second
# `transformers` import between model loads).
import PyPDF2
import faiss
import numpy as np
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BertForMaskedLM,  # NOTE(review): imported but never used below — confirm before removing
    BertForQuestionAnswering,
    BertTokenizer,
)

# Tokenizer/encoder pair used by encode_text() to embed chunks and queries.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
qa_model = BertForQuestionAnswering.from_pretrained(
    'bert-large-uncased-whole-word-masking-finetuned-squad')

# Generative model that drafts the final answer text; moved to GPU when available.
gen_model_id = "distilgpt2"
gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_id)
gen_model = AutoModelForCausalLM.from_pretrained(gen_model_id).to(
    "cuda" if torch.cuda.is_available() else "cpu")
14
def read_pdf(file):
    """Extract and concatenate the text of every page in a PDF.

    Args:
        file: A path or file-like object accepted by ``PyPDF2.PdfReader``.

    Returns:
        str: Text of all pages joined together; pages whose extraction
        yields nothing contribute an empty string.
    """
    reader = PyPDF2.PdfReader(file)
    # extract_text() returns None/"" for image-only pages; coalesce to ""
    # so joining never fails on NoneType. join() also avoids quadratic +=.
    return "".join(page.extract_text() or "" for page in reader.pages)
20
+
21
def split_text(text, chunk_size=500):
    """Break *text* into consecutive chunks of at most *chunk_size* characters."""
    starts = range(0, len(text), chunk_size)
    return [text[start:start + chunk_size] for start in starts]
23
+
24
def encode_text(text):
    """Embed *text* as a single float32 vector via mean-pooled BERT hidden states.

    Uses the module-level ``tokenizer`` and ``qa_model``; returns a
    ``(1, hidden_dim)`` numpy array suitable for the faiss index.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        hidden = qa_model.bert(**encoded).last_hidden_state
    # Mean-pool over the token axis; faiss requires float32 input.
    return hidden.mean(dim=1).numpy().astype(np.float32)
29
+
30
def create_faiss_index(chunks):
    """Build an exact L2 faiss index over the embeddings of *chunks*.

    Returns:
        tuple: ``(index, embeddings)`` where *embeddings* is the stacked
        ``(n_chunks, dim)`` float32 matrix that was added to the index.
    """
    vectors = [encode_text(chunk) for chunk in chunks]
    embeddings = np.vstack(vectors)
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    return index, embeddings
35
+
36
+
37
def search_faq(query, index, k=3):
    """Return the indices (shape ``(1, k)``) of the *k* chunks nearest to *query*."""
    embedded = encode_text(query)
    # Distances are not needed downstream; keep only the neighbor ids.
    _, neighbor_ids = index.search(embedded, k)
    return neighbor_ids
41
+
42
def generate_distilgpt2_answer(context, question):
    """Generate an answer to *question* grounded in *context* using distilgpt2.

    Builds a "Context/Question/Answer:" prompt, greedily decodes up to 64 new
    tokens, and returns only the text after the final "Answer:" marker.
    """
    prompt = f"Context: {context}\nQuestion: {question}\nAnswer:"
    model_inputs = gen_tokenizer(prompt, return_tensors="pt").to(gen_model.device)
    with torch.no_grad():
        output_ids = gen_model.generate(
            **model_inputs,
            max_new_tokens=64,
            # GPT-2 has no pad token; reuse EOS so generate() doesn't warn.
            pad_token_id=gen_tokenizer.eos_token_id,
            eos_token_id=gen_tokenizer.eos_token_id,
            repetition_penalty=1.3,
        )
    decoded = gen_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return decoded.split("Answer:")[-1].strip()
52
+
53
def process_pdf_and_answer(pdf_file, query, top_k=1):
    """End-to-end pipeline: read a PDF, retrieve *top_k* chunks, answer *query*.

    Args:
        pdf_file: PDF path/file object forwarded to ``read_pdf``.
        query: The user's question.
        top_k: Number of retrieved chunks to answer from (default 1).

    Returns:
        str: One generated answer per retrieved chunk, separated by ``---``;
        empty string when the PDF yields no text.
    """
    text = read_pdf(pdf_file)
    chunks = split_text(text)
    if not chunks:
        # Empty or unextractable PDF: nothing to index or retrieve.
        return ""
    faiss_index, _ = create_faiss_index(chunks)
    # faiss pads results with -1 when k exceeds the number of indexed vectors,
    # and a -1 index would silently answer from chunks[-1]; clamp and filter.
    k = max(1, min(int(top_k), len(chunks)))
    indices = search_faq(query, faiss_index, k=k)

    answers = []
    for idx in indices[0]:
        if idx < 0:
            continue
        answers.append(generate_distilgpt2_answer(chunks[idx], query))

    return "\n\n---\n\n".join(answers)
66
+
67
import gradio as gr

# Gradio front-end: wires the PDF-QA pipeline to a simple web form.
# Two inputs (PDF file, question text); process_pdf_and_answer's top_k
# keeps its default of 1 since the slider input was removed from the UI.
interface = gr.Interface(
    fn=process_pdf_and_answer,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Textbox(label="Your Question"),
    ],
    outputs=gr.Textbox(label="Generated Answer(s)"),
    title="📄 PDF Question Answering",
    description="Upload a PDF and ask a question about its content. The model will try to answer based on the most relevant chunks.",
)

# Start the local Gradio server (blocking call).
interface.launch()