Avinash250325 committed on
Commit
a1482f3
·
verified ·
1 Parent(s): 99df86b

Create backend.py

Browse files
Files changed (1) hide show
  1. backend.py +77 -0
backend.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import numpy as np
3
+ import faiss
4
+ from sentence_transformers import SentenceTransformer
5
+ from transformers import T5Tokenizer, T5ForConditionalGeneration
6
+ from PyPDF2 import PdfReader
7
+ import spacy
8
+
9
# Load SpaCy and the embedding / question-generation models once at import
# time so every request reuses them.
nlp = spacy.load("en_core_web_sm")  # sentence segmentation pipeline
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  # chunk/query embedder

# Fine-tuned T5 question-generation model, loaded from a local directory
# (must exist next to this file — TODO confirm deployment layout).
tokenizer = T5Tokenizer.from_pretrained("./T5base_Question_Generation")
t5_model = T5ForConditionalGeneration.from_pretrained("./T5base_Question_Generation")
15
+
16
def extract_text_from_pdf(pdf_path):
    """Extract plain text from every page of a PDF file.

    Args:
        pdf_path: Path to the PDF file on disk.

    Returns:
        The concatenated text of all pages, each page's text followed by
        a newline. Pages with no extractable text (e.g. scanned images)
        are skipped.
    """
    reader = PdfReader(pdf_path)
    pages = []
    for page in reader.pages:
        # Call extract_text() only once per page — the original called it
        # twice (in the condition and again in the append), doubling the
        # expensive extraction work.
        page_text = page.extract_text()
        if page_text:
            pages.append(page_text + "\n")
    # join instead of repeated += avoids quadratic string concatenation.
    return "".join(pages)
23
+
24
def split_into_sentences(text):
    """Split raw text into sentences using the module-level SpaCy pipeline.

    Args:
        text: Arbitrary (possibly multi-paragraph) text.

    Returns:
        List of non-empty, whitespace-stripped sentence strings.
    """
    sentences = []
    for sentence in nlp(text).sents:
        cleaned = sentence.text.strip()
        if cleaned:
            sentences.append(cleaned)
    return sentences
27
+
28
def create_chunks(sentences, window_size=2):
    """Group consecutive sentences into overlapping sliding-window chunks.

    Each chunk joins `window_size` consecutive sentences, so adjacent
    chunks overlap by window_size - 1 sentences.

    Args:
        sentences: List of sentence strings.
        window_size: Number of consecutive sentences per chunk.

    Returns:
        List of chunk strings. If there are fewer sentences than
        `window_size`, the available sentences are joined into a single
        chunk (the original returned [] here, which crashed downstream
        FAISS index construction on short documents). An empty input
        yields an empty list.
    """
    if not sentences:
        return []
    if len(sentences) < window_size:
        # Short document: emit one chunk rather than none.
        return [" ".join(sentences)]
    return [
        " ".join(sentences[i:i + window_size])
        for i in range(len(sentences) - window_size + 1)
    ]
30
+
31
def generate_embeddings(chunks):
    """Encode each text chunk into a dense vector with the MiniLM model.

    Args:
        chunks: List of text chunk strings.

    Returns:
        The encoder's output, one embedding row per chunk (presumably a
        2-D numpy array — verify against the SentenceTransformer version
        in use).
    """
    # Progress bar gives feedback when embedding large documents.
    vectors = embedding_model.encode(chunks, show_progress_bar=True)
    return vectors
33
+
34
def create_faiss_index(embeddings):
    """Build an exact L2 (Euclidean) FAISS index over the embeddings.

    Args:
        embeddings: 2-D array-like of shape (n_vectors, dim); rows are
            embedding vectors (e.g. the output of generate_embeddings).

    Returns:
        A populated faiss.IndexFlatL2.

    Raises:
        ValueError: If `embeddings` is empty or not 2-D — the original
            raised an opaque IndexError via embeddings[0] instead.
    """
    # Convert once; FAISS requires float32 input.
    vectors = np.asarray(embeddings, dtype="float32")
    if vectors.ndim != 2 or vectors.shape[0] == 0:
        raise ValueError("embeddings must be a non-empty 2-D array")
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index
39
+
40
def retrieve_relevant_chunks(query, chunks, index, top_k=30):
    """Return the chunks most similar to `query` by L2 distance.

    Args:
        query: Natural-language query string.
        chunks: The chunk strings the index was built from, in the same
            order as the indexed vectors.
        index: FAISS index over the chunk embeddings.
        top_k: Maximum number of chunks to retrieve.

    Returns:
        (relevant_chunks, distances): best-matching chunk strings and
        their L2 distances, closest first.
    """
    # Never ask FAISS for more neighbours than it holds: when k exceeds
    # index.ntotal, FAISS pads the result with index -1, which chunks[-1]
    # would silently misread as the *last* chunk (a real retrieval bug
    # for documents with fewer than top_k chunks).
    k = min(top_k, index.ntotal)
    query_embedding = embedding_model.encode([query])
    distances, indices = index.search(np.asarray(query_embedding), k)
    valid = [i for i in indices[0] if i >= 0]
    return [chunks[i] for i in valid], distances[0]
44
+
45
def get_questions(tag, difficulty, context, num_questions=3, max_length=150):
    """Generate questions about `context` with the fine-tuned T5 model.

    The prompt encodes the question tag and difficulty via the model's
    sentinel tokens, presumably matching the fine-tuning input format —
    verify against the training script.

    Args:
        tag: Question-type tag understood by the model.
        difficulty: Difficulty label understood by the model.
        context: Passage to generate questions from.
        num_questions: How many sampled questions to return.
        max_length: Maximum generated sequence length in tokens.

    Returns:
        List of decoded question strings (sampling is enabled, so output
        is non-deterministic).
    """
    prompt = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99> {context}"
    encoded = tokenizer([prompt], return_tensors='pt')
    generated = t5_model.generate(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=max_length,
        num_return_sequences=num_questions,
        do_sample=True,
        top_p=0.95,
        top_k=50
    )
    decoded = []
    for sequence in generated:
        decoded.append(tokenizer.decode(sequence, skip_special_tokens=True))
    return decoded
58
+
59
def process_pdf(pdf_file, tag, difficulty, query):
    """End-to-end pipeline: PDF -> sentences -> chunks -> retrieval -> questions.

    Args:
        pdf_file: Uploaded file object exposing a `.name` path
            (Gradio-style upload — TODO confirm caller), or None when
            nothing was uploaded.
        tag: Question-type tag forwarded to the generator.
        difficulty: Difficulty label forwarded to the generator.
        query: Retrieval query used to select relevant context.

    Returns:
        A newline-separated string of generated questions, or a
        human-readable error message.
    """
    if pdf_file is None:
        return "Please upload a PDF file."

    text = extract_text_from_pdf(pdf_file.name)
    sentences = split_into_sentences(text)
    chunks = create_chunks(sentences)

    # Guard against PDFs with no extractable text (e.g. scanned images):
    # without this, FAISS index construction crashes on empty input
    # instead of returning a useful message to the user.
    if not chunks:
        return "No text could be extracted from the PDF."

    embeddings = generate_embeddings(chunks)
    index = create_faiss_index(embeddings)
    relevant_chunks, _ = retrieve_relevant_chunks(query, chunks, index)

    # Keep only substantial chunks (> 20 words) and cap the context at 3.
    filtered_chunks = [chunk for chunk in relevant_chunks if len(chunk.split()) > 20][:3]

    if not filtered_chunks:
        return "No sufficiently long chunks found. Try another query."

    context = " ".join(filtered_chunks)
    questions = get_questions(tag, difficulty, context)
    return "\n".join([f"Question {i+1}: {q}" for i, q in enumerate(questions)])