Upload 5 files
- README.md +28 -14
- main.py +30 -0
- ocr.py +8 -0
- rag_model.py +15 -0
- requirements.txt +4 -0
README.md
CHANGED
@@ -1,14 +1,28 @@

# StudyMate: AI-Powered PDF-Based Q&A System

StudyMate allows students to upload a PDF and ask questions using a powerful RAG-based AI model.

### Features
- PDF Upload & Parsing
- OCR and Content Extraction
- RAG-based Q&A using Hugging Face
- Built with Streamlit, deployable on Hugging Face Spaces

### How to Run

```bash
pip install -r requirements.txt
streamlit run main.py
```

### Folder Structure

```
studymate/
├── main.py
├── requirements.txt
├── README.md
└── backend/
    ├── ocr.py
    └── rag_model.py
```
main.py
ADDED
@@ -0,0 +1,30 @@

```python
import streamlit as st
from backend.ocr import extract_text_from_pdf
from backend.rag_model import setup_retriever_and_qa, get_answer

st.set_page_config(page_title="StudyMate - PDF Q&A", layout="wide")
st.title("StudyMate: AI-Powered PDF-Based Q&A System")

# Initialize retriever and RAG model
retriever, rag_pipeline = setup_retriever_and_qa()

uploaded_file = st.file_uploader("Upload your study material (PDF)", type="pdf")

if uploaded_file:
    with st.spinner("Extracting content from PDF..."):
        full_text = extract_text_from_pdf(uploaded_file)
        st.success("PDF content extracted!")

    if full_text:
        st.text_area("Extracted Text Preview", full_text[:2000], height=300)

        query = st.text_input("Ask a question based on the PDF")

        if query:
            with st.spinner("Thinking..."):
                answer = get_answer(full_text, query, retriever, rag_pipeline)
                st.markdown(f"**Answer:** {answer}")
    else:
        st.error("Failed to extract text from the PDF.")
else:
    st.info("Please upload a PDF to get started.")
```
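Note that Streamlit reruns the whole script on every interaction, so `setup_retriever_and_qa()` is executed on each rerun. A minimal optional sketch of memoizing that setup with Streamlit's `st.cache_resource` (assuming a recent Streamlit release; `load_qa_components` is a hypothetical wrapper, not part of this commit):

```python
# Optional sketch: cache the heavy retriever/model setup across Streamlit reruns.
# st.cache_resource keeps the returned objects alive for the process lifetime.
import streamlit as st
from backend.rag_model import setup_retriever_and_qa

@st.cache_resource
def load_qa_components():
    # Runs once; later reruns reuse the cached (retriever, pipeline) pair.
    return setup_retriever_and_qa()

retriever, rag_pipeline = load_qa_components()
```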
ocr.py
ADDED
@@ -0,0 +1,8 @@

```python
import fitz  # PyMuPDF

def extract_text_from_pdf(file) -> str:
    doc = fitz.open(stream=file.read(), filetype="pdf")
    text = ""
    for page in doc:
        text += page.get_text()
    return text.strip()
```
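For a quick local check of the extractor outside Streamlit, any object with a `.read()` method that returns PDF bytes will do, since `fitz.open(stream=..., filetype="pdf")` accepts raw bytes. A minimal sketch (the `sample.pdf` path is a placeholder):

```python
# Sketch: run the extractor against a local PDF; the path is a placeholder.
from backend.ocr import extract_text_from_pdf

with open("sample.pdf", "rb") as f:
    text = extract_text_from_pdf(f)

print(text[:500])  # preview the first 500 extracted characters
```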
rag_model.py
ADDED
@@ -0,0 +1,15 @@

```python
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration
from transformers import pipeline
import torch

def setup_retriever_and_qa():
    tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-base")
    retriever = RagRetriever.from_pretrained("facebook/rag-token-base", index_name="exact", use_dummy_dataset=True)
    rag_model = RagTokenForGeneration.from_pretrained("facebook/rag-token-base")
    qa_pipeline = pipeline("text2text-generation", model=rag_model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
    return retriever, qa_pipeline

def get_answer(context: str, question: str, retriever, qa_pipeline):
    input_text = f"question: {question} context: {context}"
    result = qa_pipeline(input_text, max_length=200, do_sample=True)
    return result[0]['generated_text']
```
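For reference: `get_answer` above feeds the question and context through the `text2text-generation` pipeline as a single string and never consults the returned `retriever`. The retriever-backed path documented for these RAG classes instead attaches the retriever to the model and calls `generate`. A minimal sketch following the transformers RAG documentation (the `facebook/rag-token-nq` checkpoint and the question are the documented example values, not from this commit):

```python
# Sketch of retriever-backed RAG generation per the transformers docs
# (checkpoint and question are illustrative, not part of this repo).
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration

tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
retriever = RagRetriever.from_pretrained(
    "facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True
)
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

inputs = tokenizer("who holds the record in 100m freestyle", return_tensors="pt")
generated = model.generate(input_ids=inputs["input_ids"])
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])
```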
requirements.txt
ADDED
@@ -0,0 +1,4 @@

```
streamlit
transformers
torch
pymupdf
```