petchutney commited on
Commit
63b8334
·
verified ·
1 Parent(s): 1111e0a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -41
app.py CHANGED
@@ -1,19 +1,17 @@
1
- def process_and_explain(pdf_file, concept_to_explain):
2
- if pdf_file is not None:
3
- extracted_text = extract_text_from_pdf(pdf_file)
4
- print(f"Extracted Text (first 500 chars):\n{extracted_text[:500]}") # Add this line
5
- if "Error reading PDF" in extracted_text or not extracted_text.strip():
6
- return "Could not extract text from the PDF. Please try another file."
7
-
8
  import gradio as gr
9
- from transformers import pipeline
10
  import PyPDF2
 
 
11
  import logging
12
 
13
  logging.basicConfig(level=logging.INFO)
14
  logger = logging.getLogger(__name__)
15
 
16
- # Load the explainer model
 
 
 
17
  explainer = pipeline("text2text-generation", model="google/flan-t5-base")
18
 
19
  def extract_text_from_pdf(pdf_file):
@@ -21,52 +19,64 @@ def extract_text_from_pdf(pdf_file):
21
  try:
22
  with open(pdf_file.name, 'rb') as pdfFileObj:
23
  pdfReader = PyPDF2.PdfReader(pdfFileObj)
24
- num_pages = len(pdfReader.pages)
25
- logger.info(f"Number of pages in PDF: {num_pages}")
26
- for pageNum in range(num_pages):
27
  pageObj = pdfReader.pages[pageNum]
28
- page_text = pageObj.extract_text()
29
- text += page_text
30
- logger.info(f"Extracted {len(text)} characters from PDF.")
31
  except Exception as e:
32
  logger.error(f"Error reading PDF: {e}")
33
- return f"Error reading PDF: {e}"
34
  return text
35
 
36
- def process_and_explain(pdf_file, concept_to_explain):
37
- logger.info("Processing request...")
 
 
 
 
 
 
 
 
38
  if pdf_file is not None:
39
  extracted_text = extract_text_from_pdf(pdf_file)
40
- logger.info(f"Extracted text (first 500 chars):\n{extracted_text[:500]}")
41
- if "Error reading PDF" in extracted_text or not extracted_text.strip():
42
- return "Could not extract text from the PDF. Please try another file."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
- prompt = f"Explain the concept of '{concept_to_explain}' based on this text: '{extracted_text[:1500]}...' in simple terms."
45
- logger.info(f"Prompting explainer model with: {prompt[:200]}...")
46
- try:
47
- explanation_result = explainer(prompt, max_length=300, do_sample=False)
48
- explanation = explanation_result[0]["generated_text"]
49
- logger.info(f"Explanation generated: {explanation}")
50
- return f"**Extracted Text Snippet:**\n{extracted_text[:500]}...\n\n**Explanation of '{concept_to_explain}':**\n{explanation}"
51
- except Exception as e:
52
- logger.error(f"Error during explanation generation: {e}")
53
- return f"An error occurred while generating the explanation: {e}"
54
  else:
55
- return "Please upload a PDF file."
56
 
57
  with gr.Blocks() as demo:
58
- gr.Markdown("## Explain Concepts from Uploaded PDFs")
59
- gr.Markdown("Upload a PDF, and then specify a concept you'd like explained based on its content.")
60
 
61
  pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
62
- concept_input = gr.Textbox(label="Concept to Explain", placeholder="E.g., Climate Change")
63
- explain_button = gr.Button("Explain Concept")
64
- output_text = gr.Markdown(label="Explanation")
65
 
66
- explain_button.click(
67
- fn=process_and_explain,
68
- inputs=[pdf_input, concept_input],
69
- outputs=output_text
70
  )
71
 
72
  demo.launch()
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
3
  import PyPDF2
4
+ from sentence_transformers import SentenceTransformer
5
+ import torch
6
  import logging
7
 
8
  logging.basicConfig(level=logging.INFO)
9
  logger = logging.getLogger(__name__)
10
 
11
# --- Load Models and Tokenizers ---
# Sentence embedder used to rank text chunks by similarity to the question.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# Extractive QA model + matching tokenizer (SQuAD-finetuned BERT).
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
qa_model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
# NOTE(review): `explainer` does not appear to be referenced anywhere in this
# version of the file — confirm it is still needed before keeping the load.
explainer = pipeline("text2text-generation", model="google/flan-t5-base")
16
 
17
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page from an uploaded PDF.

    Args:
        pdf_file: Uploaded-file object (e.g. from ``gr.File``) exposing a
            ``.name`` attribute holding the on-disk path.

    Returns:
        The concatenated page text as a single string, or ``None`` when the
        PDF cannot be opened or parsed.
    """
    # Bug fix: the accumulator was never initialised, so the NameError raised
    # by the first `text +=` was swallowed by the broad `except` below and the
    # function always returned None.
    text = ""
    try:
        with open(pdf_file.name, 'rb') as pdfFileObj:
            pdfReader = PyPDF2.PdfReader(pdfFileObj)
            for pageNum in range(len(pdfReader.pages)):
                pageObj = pdfReader.pages[pageNum]
                # extract_text() can return None for image-only pages;
                # guard so concatenation doesn't raise TypeError.
                page_text = pageObj.extract_text()
                if page_text:
                    text += page_text
    except Exception as e:
        logger.error(f"Error reading PDF: {e}")
        return None
    return text
29
 
30
def chunk_text(text, chunk_size=500, chunk_overlap=50):
    """Split *text* into overlapping fixed-size chunks.

    Args:
        text: String to split.
        chunk_size: Maximum number of characters per chunk.
        chunk_overlap: Number of characters shared between consecutive chunks.

    Returns:
        A list of chunk strings (empty list for empty input). The final chunk
        may be shorter than ``chunk_size``.

    Raises:
        ValueError: If ``chunk_overlap >= chunk_size`` — the original loop
            would never advance and spin forever.
    """
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size")
    step = chunk_size - chunk_overlap
    # Slicing clamps at the end of the string, so no explicit min() is needed.
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]
38
+
39
def process_and_answer(pdf_file, question):
    """Answer *question* using the most relevant chunk of the uploaded PDF.

    Pipeline: extract text -> chunk it -> embed chunks and question ->
    pick the most similar chunk -> run extractive QA over that chunk.

    Args:
        pdf_file: Uploaded-file object from ``gr.File`` (or ``None``).
        question: The user's natural-language question.

    Returns:
        The extracted answer span, or a human-readable message when no PDF
        was supplied, extraction failed, or no answer span was found.
    """
    import numpy as np  # hoisted from mid-function so it runs once, up front

    if pdf_file is not None:
        extracted_text = extract_text_from_pdf(pdf_file)
        if not extracted_text:
            return "Could not extract text from the PDF."

        text_chunks = chunk_text(extracted_text)
        embeddings = embedding_model.encode(text_chunks)

        question_embedding = embedding_model.encode(question)

        # Simple similarity search (you can use a more efficient method for larger documents)
        similarities = np.inner(question_embedding, embeddings)
        most_relevant_chunk_index = np.argmax(similarities)
        context = text_chunks[most_relevant_chunk_index]

        # Bug fix: truncation="only" is not a valid strategy; "only_second"
        # truncates the context (second sequence) while keeping the question
        # intact, which is the standard setup for extractive QA.
        inputs = tokenizer(question, context, return_tensors="pt", truncation="only_second", max_length=512)
        with torch.no_grad():
            outputs = qa_model(**inputs)
        # Most likely start/end token positions; end index is exclusive in the
        # slice below, hence the +1.
        answer_start_index = torch.argmax(outputs.start_logits)
        answer_end_index = torch.argmax(outputs.end_logits) + 1
        answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start_index:answer_end_index]))

        return answer.strip() if answer.strip() else "Could not find an answer in the document."
    else:
        return "Please upload a PDF file and ask a question."
66
 
67
# --- Gradio UI: wire the QA pipeline to a simple upload-and-ask interface ---
with gr.Blocks() as demo:
    gr.Markdown("## Ask Questions About Your Documents")
    gr.Markdown("Upload a PDF and ask specific questions about its content.")

    # Inputs: the PDF to search and the user's question.
    uploaded_pdf = gr.File(label="Upload PDF", file_types=[".pdf"])
    user_question = gr.Textbox(label="Your Question", placeholder="E.g., Who is the author of this book?")

    # Trigger and output display.
    find_btn = gr.Button("Find Answer")
    answer_box = gr.Textbox(label="Answer")

    # Clicking the button runs the full extract->chunk->retrieve->QA pipeline.
    find_btn.click(fn=process_and_answer, inputs=[uploaded_pdf, user_question], outputs=answer_box)

demo.launch()