arif670 committed on
Commit
0fe6418
·
verified ·
1 Parent(s): a9bce3d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -105
app.py CHANGED
@@ -1,115 +1,64 @@
1
  import streamlit as st
2
- import requests
3
  from bs4 import BeautifulSoup
4
- from duckduckgo_search import ddg
 
5
  import pandas as pd
6
- from selenium import webdriver
7
- from selenium.webdriver.common.by import By
8
- from selenium.webdriver.chrome.service import Service
9
- from webdriver_manager.chrome import ChromeDriverManager
10
-
11
-
12
- # Function to search the web for relevant URLs using DuckDuckGo
13
- def search_web(subject_code, exam_year, variant, session):
14
- query = f"Cambridge O-Level {subject_code} {exam_year} variant {variant} {session} questions and answers"
15
- st.write(f"Performing DuckDuckGo Search for: {query}")
16
- results = ddg(query, max_results=5)
17
- urls = [result["href"] for result in results]
18
- return urls
19
-
20
-
21
- # Function to scrape questions and answers using requests
22
- def scrape_with_requests(url):
23
- headers = {
24
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
25
- }
26
- try:
27
- response = requests.get(url, headers=headers)
28
- if response.status_code == 200:
29
- soup = BeautifulSoup(response.text, "html.parser")
30
- questions = [q.text.strip() for q in soup.select(".question-class")]
31
- answers = [a.text.strip() for a in soup.select(".answer-class")]
32
-
33
- if len(questions) != len(answers):
34
- st.warning(f"Data inconsistency on {url}. Skipping this source.")
35
- return None
36
-
37
- return pd.DataFrame({"Question": questions, "Answer": answers})
38
- else:
39
- st.warning(f"Failed to access {url}. HTTP Status Code: {response.status_code}")
40
- return None
41
- except Exception as e:
42
- st.error(f"Error scraping {url}: {e}")
43
- return None
44
-
45
-
46
- # Function to scrape questions and answers using Selenium
47
- def scrape_with_selenium(url):
48
- options = webdriver.ChromeOptions()
49
- options.add_argument("--headless")
50
- options.add_argument("--disable-gpu")
51
- options.add_argument(
52
- "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
53
- )
54
 
55
- driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
56
- driver.get(url)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
- try:
59
- questions = [q.text for q in driver.find_elements(By.CSS_SELECTOR, ".question-class")]
60
- answers = [a.text for a in driver.find_elements(By.CSS_SELECTOR, ".answer-class")]
61
-
62
- if len(questions) != len(answers):
63
- st.warning(f"Data inconsistency on {url}. Skipping this source.")
64
- return None
65
-
66
- return pd.DataFrame({"Question": questions, "Answer": answers})
67
- except Exception as e:
68
- st.error(f"Error scraping {url} with Selenium: {e}")
69
- return None
70
- finally:
71
- driver.quit()
72
-
73
-
74
- # Streamlit app
75
  def main():
76
- st.title("Cambridge O-Level Q&A Extractor")
77
- st.sidebar.header("Input Parameters")
78
- subject_code = st.sidebar.text_input("Subject Code", "0610")
79
- exam_year = st.sidebar.text_input("Exam Year", "2023")
80
- variant = st.sidebar.text_input("Variant", "1")
81
- session = st.sidebar.selectbox("Session", ["May-Jun", "Oct-Nov"])
82
-
83
- if st.sidebar.button("Fetch Questions & Answers"):
84
- st.write("Searching the internet for relevant sources...")
85
- urls = search_web(subject_code, exam_year, variant, session)
86
-
87
- if urls:
88
- st.write(f"Found {len(urls)} potential sources. Scraping data...")
89
- all_data = []
90
-
91
- for url in urls:
92
- st.write(f"Processing: {url}")
93
- data = scrape_with_requests(url)
94
- if data is None:
95
- st.write(f"Falling back to Selenium for {url}...")
96
- data = scrape_with_selenium(url)
97
-
98
- if data is not None:
99
- all_data.append(data)
100
-
101
- if all_data:
102
- final_data = pd.concat(all_data, ignore_index=True)
103
- st.subheader("Questions and Answers")
104
- for _, row in final_data.iterrows():
105
- st.write(f"**Q: {row['Question']}**")
106
- st.write(f"A: {row['Answer']}")
107
  else:
108
- st.warning("No data could be retrieved from the identified sources.")
109
  else:
110
- st.warning("No relevant sources found on the internet.")
111
-
112
 
113
  if __name__ == "__main__":
114
- main()
115
-
 
1
  import streamlit as st
 
2
  from bs4 import BeautifulSoup
3
+ import requests
4
+ from transformers import pipeline
5
  import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
# Load a pre-trained extractive question-answering model.
# NOTE(review): no model name is pinned, so transformers downloads its
# current default QA checkpoint at import time — pin an explicit model
# for reproducibility, and note this download happens on every cold start.
qa_model = pipeline("question-answering")
9
+
10
@st.cache(allow_output_mutation=True)
def fetch_past_papers(subject_code, exam_year, variant, session):
    """Return the href of the matching past paper link, or None.

    Scrapes the Cambridge International past-papers listing page for
    *subject_code*, keeps only anchors whose href falls under the
    *exam_year*/*session* path, and returns the first one whose link
    text matches the requested *variant*.

    NOTE(review): ``st.cache`` is deprecated/removed in modern Streamlit
    (use ``st.cache_data``) — kept as-is for compatibility with the
    pinned Streamlit version; confirm before upgrading.
    """
    url = (
        "https://www.cambridgeinternational.org/programmes-and-qualifications/"
        f"cambridge-o-level/{subject_code}/past-papers-and-mark-schemes/"
    )
    try:
        # Bound the request so a hung server can't freeze the app forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        # Network failure — treat the same as "paper not found".
        return None
    if not response.ok:
        # HTTP errors (e.g. 404 for an unknown subject code) previously
        # parsed the error page silently; make the "not found" path explicit.
        return None

    soup = BeautifulSoup(response.content, "html.parser")

    # Only anchors whose href sits under this subject/year/session path.
    prefix = (
        f"/programmes-and-qualifications/cambridge-o-level/{subject_code}"
        f"/past-papers-and-mark-schemes/{exam_year}/{session}/"
    )
    past_papers = soup.find_all(
        "a", href=lambda href: href and href.startswith(prefix)
    )

    # Match the requested variant by visible link text, e.g. "1 Paper 1".
    wanted = f"{variant} Paper {variant}"
    for paper in past_papers:
        if paper.text.strip() == wanted:
            return paper["href"]

    return None
24
+
25
def extract_questions_and_answers(past_paper_url, ocr_text=None):
    """Extract question/answer pairs from a past paper's OCR'd text.

    Args:
        past_paper_url: URL of the past paper (to be consumed by the OCR
            step once one is wired in — e.g. PyMuPDF or Tesseract).
        ocr_text: Pre-extracted plain text of the paper, with paragraphs
            separated by blank lines. Until OCR is implemented, callers
            must supply this; the original body read an undefined
            ``ocr_result`` variable and crashed with NameError.

    Returns:
        A list of ``{"question": ..., "answer": ...}`` dicts.

    Raises:
        NotImplementedError: if *ocr_text* is not provided (OCR step is
            not implemented yet).
    """
    if ocr_text is None:
        # TODO: implement OCR of past_paper_url and remove this guard.
        raise NotImplementedError(
            "OCR extraction is not implemented; pass ocr_text explicitly."
        )

    questions_and_answers = []
    for paragraph in ocr_text.split("\n\n"):
        if not paragraph.strip():
            continue  # skip empty gaps between paragraphs
        # transformers QA pipelines are invoked as
        # qa_model(question=..., context=...) and return a dict with an
        # "answer" key. The original called a nonexistent
        # .question_answering() method and read a nonexistent "question" key.
        question = qa_model(
            question="What is the question?", context=paragraph
        )["answer"].strip()
        if question:
            answer = qa_model(question=question, context=paragraph)["answer"]
            questions_and_answers.append({"question": question, "answer": answer})

    return questions_and_answers
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
def main():
    """Streamlit front-end: collect exam criteria and display extracted Q&A."""
    st.title("Cambridge O-Level Exam Q&A Extractor")

    # Gather the search criteria from the user.
    subject_code = st.text_input("Subject Code")
    exam_year = st.selectbox("Exam Year", [str(y) for y in range(2015, 2026)])
    variant = st.selectbox("Variant", ["1", "2", "3"])
    session = st.selectbox("Session", ["May-Jun", "Oct-Nov"])

    if not st.button("Search"):
        return

    past_paper_url = fetch_past_papers(subject_code, exam_year, variant, session)
    if not past_paper_url:
        st.error("Past paper not found for the specified criteria.")
        return

    questions_and_answers = extract_questions_and_answers(past_paper_url)
    if not questions_and_answers:
        st.error("No questions and answers found in the extracted text.")
        return

    # Render the extracted pairs as an interactive table.
    st.dataframe(pd.DataFrame(questions_and_answers))
 
62
 
63
# Script entry point: launch the app when executed directly
# (typically via `streamlit run app.py`).
if __name__ == "__main__":
    main()