arif670 committed on
Commit
0fe6418
·
verified ·
1 Parent(s): a9bce3d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -105
app.py CHANGED
@@ -1,115 +1,64 @@
1
  import streamlit as st
2
- import requests
3
  from bs4 import BeautifulSoup
4
- from duckduckgo_search import ddg
 
5
  import pandas as pd
6
- from selenium import webdriver
7
- from selenium.webdriver.common.by import By
8
- from selenium.webdriver.chrome.service import Service
9
- from webdriver_manager.chrome import ChromeDriverManager
10
-
11
-
12
- # Function to search the web for relevant URLs using DuckDuckGo
13
- def search_web(subject_code, exam_year, variant, session):
14
- query = f"Cambridge O-Level {subject_code} {exam_year} variant {variant} {session} questions and answers"
15
- st.write(f"Performing DuckDuckGo Search for: {query}")
16
- results = ddg(query, max_results=5)
17
- urls = [result["href"] for result in results]
18
- return urls
19
-
20
-
21
- # Function to scrape questions and answers using requests
22
- def scrape_with_requests(url):
23
- headers = {
24
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
25
- }
26
- try:
27
- response = requests.get(url, headers=headers)
28
- if response.status_code == 200:
29
- soup = BeautifulSoup(response.text, "html.parser")
30
- questions = [q.text.strip() for q in soup.select(".question-class")]
31
- answers = [a.text.strip() for a in soup.select(".answer-class")]
32
-
33
- if len(questions) != len(answers):
34
- st.warning(f"Data inconsistency on {url}. Skipping this source.")
35
- return None
36
-
37
- return pd.DataFrame({"Question": questions, "Answer": answers})
38
- else:
39
- st.warning(f"Failed to access {url}. HTTP Status Code: {response.status_code}")
40
- return None
41
- except Exception as e:
42
- st.error(f"Error scraping {url}: {e}")
43
- return None
44
-
45
-
46
- # Function to scrape questions and answers using Selenium
47
- def scrape_with_selenium(url):
48
- options = webdriver.ChromeOptions()
49
- options.add_argument("--headless")
50
- options.add_argument("--disable-gpu")
51
- options.add_argument(
52
- "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
53
- )
54
 
55
- driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
56
- driver.get(url)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
- try:
59
- questions = [q.text for q in driver.find_elements(By.CSS_SELECTOR, ".question-class")]
60
- answers = [a.text for a in driver.find_elements(By.CSS_SELECTOR, ".answer-class")]
61
-
62
- if len(questions) != len(answers):
63
- st.warning(f"Data inconsistency on {url}. Skipping this source.")
64
- return None
65
-
66
- return pd.DataFrame({"Question": questions, "Answer": answers})
67
- except Exception as e:
68
- st.error(f"Error scraping {url} with Selenium: {e}")
69
- return None
70
- finally:
71
- driver.quit()
72
-
73
-
74
- # Streamlit app
75
  def main():
76
- st.title("Cambridge O-Level Q&A Extractor")
77
- st.sidebar.header("Input Parameters")
78
- subject_code = st.sidebar.text_input("Subject Code", "0610")
79
- exam_year = st.sidebar.text_input("Exam Year", "2023")
80
- variant = st.sidebar.text_input("Variant", "1")
81
- session = st.sidebar.selectbox("Session", ["May-Jun", "Oct-Nov"])
82
-
83
- if st.sidebar.button("Fetch Questions & Answers"):
84
- st.write("Searching the internet for relevant sources...")
85
- urls = search_web(subject_code, exam_year, variant, session)
86
-
87
- if urls:
88
- st.write(f"Found {len(urls)} potential sources. Scraping data...")
89
- all_data = []
90
-
91
- for url in urls:
92
- st.write(f"Processing: {url}")
93
- data = scrape_with_requests(url)
94
- if data is None:
95
- st.write(f"Falling back to Selenium for {url}...")
96
- data = scrape_with_selenium(url)
97
-
98
- if data is not None:
99
- all_data.append(data)
100
-
101
- if all_data:
102
- final_data = pd.concat(all_data, ignore_index=True)
103
- st.subheader("Questions and Answers")
104
- for _, row in final_data.iterrows():
105
- st.write(f"**Q: {row['Question']}**")
106
- st.write(f"A: {row['Answer']}")
107
  else:
108
- st.warning("No data could be retrieved from the identified sources.")
109
  else:
110
- st.warning("No relevant sources found on the internet.")
111
-
112
 
113
  if __name__ == "__main__":
114
- main()
115
-
 
1
  import streamlit as st
 
2
  from bs4 import BeautifulSoup
3
+ import requests
4
+ from transformers import pipeline
5
  import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
# Load a pre-trained extractive question-answering model.
# NOTE(review): no model name is pinned, so transformers downloads its
# current default QA checkpoint at import time — pin an explicit model
# for reproducibility, and note this download happens on every cold start.
qa_model = pipeline("question-answering")
9
+
10
@st.cache(allow_output_mutation=True)
def fetch_past_papers(subject_code, exam_year, variant, session):
    """Return the href of the matching past paper link, or None.

    Scrapes the Cambridge International past-papers listing page for
    *subject_code*, keeps only anchors whose href falls under the
    *exam_year*/*session* path, and returns the first one whose link
    text matches the requested *variant*.

    NOTE(review): ``st.cache`` is deprecated/removed in modern Streamlit
    (use ``st.cache_data``) — kept as-is for compatibility with the
    pinned Streamlit version; confirm before upgrading.
    """
    url = (
        "https://www.cambridgeinternational.org/programmes-and-qualifications/"
        f"cambridge-o-level/{subject_code}/past-papers-and-mark-schemes/"
    )
    try:
        # Bound the request so a hung server can't freeze the app forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        # Network failure — treat the same as "paper not found".
        return None
    if not response.ok:
        # HTTP errors (e.g. 404 for an unknown subject code) previously
        # parsed the error page silently; make the "not found" path explicit.
        return None

    soup = BeautifulSoup(response.content, "html.parser")

    # Only anchors whose href sits under this subject/year/session path.
    prefix = (
        f"/programmes-and-qualifications/cambridge-o-level/{subject_code}"
        f"/past-papers-and-mark-schemes/{exam_year}/{session}/"
    )
    past_papers = soup.find_all(
        "a", href=lambda href: href and href.startswith(prefix)
    )

    # Match the requested variant by visible link text, e.g. "1 Paper 1".
    wanted = f"{variant} Paper {variant}"
    for paper in past_papers:
        if paper.text.strip() == wanted:
            return paper["href"]

    return None
24
+
25
def extract_questions_and_answers(past_paper_url, ocr_text=None):
    """Extract question/answer pairs from a past paper's OCR'd text.

    Args:
        past_paper_url: URL of the past paper (to be consumed by the OCR
            step once one is wired in — e.g. PyMuPDF or Tesseract).
        ocr_text: Pre-extracted plain text of the paper, with paragraphs
            separated by blank lines. Until OCR is implemented, callers
            must supply this; the original body read an undefined
            ``ocr_result`` variable and crashed with NameError.

    Returns:
        A list of ``{"question": ..., "answer": ...}`` dicts.

    Raises:
        NotImplementedError: if *ocr_text* is not provided (OCR step is
            not implemented yet).
    """
    if ocr_text is None:
        # TODO: implement OCR of past_paper_url and remove this guard.
        raise NotImplementedError(
            "OCR extraction is not implemented; pass ocr_text explicitly."
        )

    questions_and_answers = []
    for paragraph in ocr_text.split("\n\n"):
        if not paragraph.strip():
            continue  # skip empty gaps between paragraphs
        # transformers QA pipelines are invoked as
        # qa_model(question=..., context=...) and return a dict with an
        # "answer" key. The original called a nonexistent
        # .question_answering() method and read a nonexistent "question" key.
        question = qa_model(
            question="What is the question?", context=paragraph
        )["answer"].strip()
        if question:
            answer = qa_model(question=question, context=paragraph)["answer"]
            questions_and_answers.append({"question": question, "answer": answer})

    return questions_and_answers
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
def main():
    """Streamlit front-end: collect exam criteria and display extracted Q&A."""
    st.title("Cambridge O-Level Exam Q&A Extractor")

    # Gather the search criteria from the user.
    subject_code = st.text_input("Subject Code")
    exam_year = st.selectbox("Exam Year", [str(y) for y in range(2015, 2026)])
    variant = st.selectbox("Variant", ["1", "2", "3"])
    session = st.selectbox("Session", ["May-Jun", "Oct-Nov"])

    if not st.button("Search"):
        return

    past_paper_url = fetch_past_papers(subject_code, exam_year, variant, session)
    if not past_paper_url:
        st.error("Past paper not found for the specified criteria.")
        return

    questions_and_answers = extract_questions_and_answers(past_paper_url)
    if not questions_and_answers:
        st.error("No questions and answers found in the extracted text.")
        return

    # Render the extracted pairs as an interactive table.
    st.dataframe(pd.DataFrame(questions_and_answers))
 
62
 
63
# Script entry point: launch the app when executed directly
# (typically via `streamlit run app.py`).
if __name__ == "__main__":
    main()