saranya19b commited on
Commit
3f7f9d9
Β·
verified Β·
1 Parent(s): eb0d386

Upload 5 files

Browse files
Files changed (5) hide show
  1. README.md +28 -14
  2. main.py +30 -0
  3. ocr.py +8 -0
  4. rag_model.py +15 -0
  5. requirements.txt +4 -0
README.md CHANGED
@@ -1,14 +1,28 @@
1
- ---
2
- title: Studymate
3
- emoji: 🏒
4
- colorFrom: red
5
- colorTo: gray
6
- sdk: gradio
7
- sdk_version: 5.39.0
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- short_description: questions and answers based on pdfs uploaded
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # StudyMate: AI-Powered PDF-Based Q&A System
2
+
3
+ StudyMate allows students to upload a PDF and ask questions using a powerful RAG-based AI model.
4
+
5
+ ### 💡 Features
6
+ - 📄 PDF Upload & Parsing
7
+ - πŸ” OCR and Content Extraction
8
+ - 🤖 RAG-based Q&A using Hugging Face
9
+ - 🌐 Built with Streamlit, deployable on Hugging Face Spaces
10
+
11
+ ### 🚀 How to Run
12
+
13
+ ```bash
14
+ pip install -r requirements.txt
15
+ streamlit run main.py
16
+ ```
17
+
18
+ ### πŸ“ Folder Structure
19
+
20
+ ```
21
+ studymate/
22
+ ├── main.py
23
+ ├── requirements.txt
24
+ ├── README.md
25
+ └── backend/
26
+     ├── ocr.py
27
+ └── rag_model.py
28
+ ```
main.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from backend.ocr import extract_text_from_pdf
3
+ from backend.rag_model import setup_retriever_and_qa, get_answer
4
+
5
+ st.set_page_config(page_title="StudyMate - PDF Q&A", layout="wide")
6
+ st.title("πŸ“š StudyMate: AI-Powered PDF-Based Q&A System")
7
+
8
+ # Initialize retriever and RAG model
9
+ retriever, rag_pipeline = setup_retriever_and_qa()
10
+
11
+ uploaded_file = st.file_uploader("Upload your study material (PDF)", type="pdf")
12
+
13
+ if uploaded_file:
14
+ with st.spinner("Extracting content from PDF..."):
15
+ full_text = extract_text_from_pdf(uploaded_file)
16
+ st.success("PDF content extracted!")
17
+
18
+ if full_text:
19
+ st.text_area("πŸ“„ Extracted Text Preview", full_text[:2000], height=300)
20
+
21
+ query = st.text_input("πŸ’¬ Ask a question based on the PDF")
22
+
23
+ if query:
24
+ with st.spinner("Thinking..."):
25
+ answer = get_answer(full_text, query, retriever, rag_pipeline)
26
+ st.markdown(f"**🧠 Answer:** {answer}")
27
+ else:
28
+ st.error("Failed to extract text from the PDF.")
29
+ else:
30
+ st.info("Please upload a PDF to get started.")
ocr.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+
3
+ def extract_text_from_pdf(file) -> str:
4
+ doc = fitz.open(stream=file.read(), filetype="pdf")
5
+ text = ""
6
+ for page in doc:
7
+ text += page.get_text()
8
+ return text.strip()
rag_model.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration
from transformers import pipeline
import torch


def setup_retriever_and_qa():
    """Build the RAG components used by the app.

    Downloads (on first call) the ``facebook/rag-token-base`` tokenizer,
    retriever and generation model from the Hugging Face Hub, then wraps
    the model in a ``text2text-generation`` pipeline.

    Returns:
        tuple: ``(retriever, qa_pipeline)`` — a ``RagRetriever`` and a
        transformers generation pipeline.
    """
    tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-base")
    # use_dummy_dataset=True loads a tiny placeholder index instead of the
    # full wiki_dpr corpus (tens of GB). NOTE(review): the dummy index
    # retrieves essentially useless passages — confirm this is intentional
    # (demo-only), since get_answer does not use the retriever anyway.
    retriever = RagRetriever.from_pretrained("facebook/rag-token-base", index_name="exact", use_dummy_dataset=True)
    rag_model = RagTokenForGeneration.from_pretrained("facebook/rag-token-base")
    # device=0 runs generation on the first GPU when available, -1 = CPU.
    qa_pipeline = pipeline("text2text-generation", model=rag_model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
    return retriever, qa_pipeline
11
+
12
def get_answer(context: str, question: str, retriever, qa_pipeline):
    """Answer *question* using *context* via the generation pipeline.

    Parameters
    ----------
    context : str
        Full text extracted from the uploaded PDF.
    question : str
        The user's natural-language question.
    retriever
        Unused; kept in the signature for backward compatibility with
        existing callers (main.py passes it).
    qa_pipeline
        A transformers ``text2text-generation`` pipeline (or any callable
        with the same ``(text, **gen_kwargs) -> [{'generated_text': ...}]``
        interface).

    Returns
    -------
    str
        The generated answer text.
    """
    input_text = f"question: {question} context: {context}"
    # do_sample=False: greedy decoding gives deterministic, reproducible
    # answers. The original used do_sample=True, which made the Q&A output
    # random from run to run on the same PDF and question.
    result = qa_pipeline(input_text, max_length=200, do_sample=False)
    return result[0]['generated_text']
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ streamlit
2
+ transformers
3
+ torch
4
+ pymupdf