import json
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import T5Tokenizer, T5ForConditionalGeneration
from PyPDF2 import PdfReader
import spacy

# Load spaCy (downloading the English model on first run), the sentence
# embedding model, and the locally fine-tuned T5 question generator
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
tokenizer = T5Tokenizer.from_pretrained("./T5base_Question_Generation")
t5_model = T5ForConditionalGeneration.from_pretrained("./T5base_Question_Generation")

def extract_text_from_pdf(pdf_path):
    # Concatenate the extractable text of every page
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        page_text = page.extract_text()  # extract once, not twice per page
        if page_text:
            text += page_text + "\n"
    return text

def split_into_sentences(text):
    # Use spaCy's sentence boundaries; drop empty sentences
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents if sent.text.strip()]

def create_chunks(sentences, window_size=2):
    # Sliding window of consecutive sentences; fall back to the raw sentences
    # when the document has fewer than window_size of them, so the result is
    # never empty for a non-empty input
    if len(sentences) < window_size:
        return sentences
    return [" ".join(sentences[i:i + window_size]) for i in range(len(sentences) - window_size + 1)]

def generate_embeddings(chunks):
    # Returns a float32 numpy array of shape (n_chunks, embedding_dim)
    return embedding_model.encode(chunks, show_progress_bar=True)

def create_faiss_index(embeddings):
    # Exact L2-distance index; FAISS requires float32 input
    embeddings = np.asarray(embeddings, dtype="float32")
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index

def retrieve_relevant_chunks(query, chunks, index, top_k=30):
    # Cap top_k at the index size; FAISS pads missing results with index -1,
    # which would otherwise index chunks from the wrong end of the list
    top_k = min(top_k, index.ntotal)
    query_embedding = embedding_model.encode([query])
    distances, indices = index.search(np.asarray(query_embedding, dtype="float32"), top_k)
    return [chunks[i] for i in indices[0] if i != -1], distances[0]
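
# Example retrieval round trip (file name and query string are illustrative):
#   sentences = split_into_sentences(extract_text_from_pdf("notes.pdf"))
#   chunks = create_chunks(sentences)
#   index = create_faiss_index(generate_embeddings(chunks))
#   hits, _ = retrieve_relevant_chunks("cell division", chunks, index, top_k=5)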

def get_questions(tag, difficulty, context, num_questions=3, max_length=150):
    # Prompt format expected by the fine-tuned checkpoint: T5 sentinel tokens
    # separate the tag, difficulty, and context fields
    input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99> {context}"
    features = tokenizer([input_text], return_tensors='pt')
    output = t5_model.generate(
        input_ids=features['input_ids'],
        attention_mask=features['attention_mask'],
        max_length=max_length,
        num_return_sequences=num_questions,
        do_sample=True,  # sampling so the returned questions differ from one another
        top_p=0.95,
        top_k=50,
    )
    return [tokenizer.decode(out, skip_special_tokens=True) for out in output]
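
# Example call (the tag and difficulty values are illustrative; the accepted
# vocabulary is whatever the fine-tuned checkpoint was trained with):
#   get_questions("biology", "easy", "Photosynthesis converts light energy ...")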

def process_pdf(pdf_file, tag, difficulty, query):
    # End-to-end pipeline: PDF -> sentences -> chunks -> retrieval -> generation
    if pdf_file is None:
        return "Please upload a PDF file."
    # An upload widget may hand over a path string or an object exposing .name
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    text = extract_text_from_pdf(pdf_path)
    sentences = split_into_sentences(text)
    chunks = create_chunks(sentences)
    if not chunks:
        return "No extractable text found in the PDF."
    embeddings = generate_embeddings(chunks)
    index = create_faiss_index(embeddings)
    relevant_chunks, _ = retrieve_relevant_chunks(query, chunks, index)
    # Keep at most three retrieved chunks long enough to support a question
    filtered_chunks = [chunk for chunk in relevant_chunks if len(chunk.split()) > 20][:3]
    if not filtered_chunks:
        return "No sufficiently long chunks found. Try another query."
    context = " ".join(filtered_chunks)
    questions = get_questions(tag, difficulty, context)
    return "\n".join(f"Question {i+1}: {q}" for i, q in enumerate(questions))