MiakOnline committed on
Commit
5590c40
·
verified ·
1 Parent(s): 190d269

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -68
app.py CHANGED
@@ -1,22 +1,19 @@
1
  import streamlit as st
2
  from pypdf import PdfReader
3
  from docx import Document
4
- import tempfile
5
- import requests
6
- from gtts import gTTS
7
-
8
  from PIL import Image
 
 
 
9
 
10
  from langchain.vectorstores import FAISS
11
  from langchain.embeddings import HuggingFaceEmbeddings
12
  from langchain.text_splitter import CharacterTextSplitter
13
  from langchain.prompts import PromptTemplate
14
  from langchain.llms import HuggingFacePipeline
15
-
16
  from transformers import pipeline
17
 
18
-
19
- # Setup HuggingFace pipeline with distilgpt2 (CPU)
20
  text_gen_pipeline = pipeline(
21
  "text-generation",
22
  model="distilgpt2",
@@ -24,70 +21,50 @@ text_gen_pipeline = pipeline(
24
  )
25
  llm = HuggingFacePipeline(pipeline=text_gen_pipeline)
26
 
27
- # Streamlit app config
28
  st.set_page_config(page_title="Learning with Fun", layout="wide")
29
  st.title("📘 Learning with Fun - Kids QA App")
30
  st.markdown("Ask questions from your syllabus! 📚")
31
 
32
- # Sidebar widgets
33
  grade = st.sidebar.selectbox("Select Grade", ["Grade 5", "Grade 6"])
34
  subject = st.sidebar.selectbox("Select Subject", ["Science", "Math", "Computer", "Islamiyat"])
35
  mode = st.sidebar.radio("Answer Format", ["🧠 Beginner Explanation", "📖 Storytelling"])
36
  voice_enabled = st.sidebar.checkbox("🔈 Enable Voice", value=True)
37
 
38
- # Fetch syllabus file from Google Drive link
39
- def fetch_from_gdrive(link: str) -> str | None:
40
- file_id = None
41
- if "id=" in link:
42
- file_id = link.split("id=")[1].split("&")[0]
43
- elif "/d/" in link:
44
- file_id = link.split("/d/")[1].split("/")[0]
45
- if not file_id:
46
- return None
47
 
48
- url = f"https://drive.google.com/uc?export=download&id={file_id}"
49
- response = requests.get(url)
50
- if response.status_code == 200:
51
- tmp_file = tempfile.NamedTemporaryFile(delete=False)
52
- tmp_file.write(response.content)
53
- tmp_file.close()
54
- return tmp_file.name
55
- return None
56
-
57
- uploaded_file = None
58
- file_link = st.text_input("Paste Google Drive Link to Syllabus File (.pdf or .docx)")
59
-
60
- if file_link:
61
- filepath = fetch_from_gdrive(file_link)
62
- if filepath:
63
- uploaded_file = filepath
64
- else:
65
- st.error("Invalid Google Drive link or download error.")
66
-
67
- # Extract text content from uploaded file
68
- def extract_text(file_path: str) -> str:
69
  text = ""
70
- if file_path.endswith(".pdf"):
71
- reader = PdfReader(file_path)
72
- for page in reader.pages:
73
- page_text = page.extract_text()
74
- if page_text:
75
- text += page_text
76
- elif file_path.endswith(".docx"):
77
- doc = Document(file_path)
78
- for para in doc.paragraphs:
79
- text += para.text + "\n"
 
 
 
 
 
 
 
80
  else:
81
- st.error("Unsupported file format. Please upload a PDF or DOCX file.")
82
  return text
83
 
84
- # Create vector store for similarity search
85
  def create_vectorstore(text: str) -> FAISS:
86
  splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
87
  docs = splitter.create_documents([text])
88
  embeddings = HuggingFaceEmbeddings()
89
- vectorstore = FAISS.from_documents(docs, embeddings)
90
- return vectorstore
91
 
92
  # Prompt templates
93
  story_prompt = PromptTemplate.from_template(
@@ -102,40 +79,35 @@ explain_prompt = PromptTemplate.from_template(
102
  "براہ کرم بچے کو اردو زبان میں آسان انداز میں سمجھائیں۔"
103
  )
104
 
105
- # Generate speech audio from text
106
  def generate_voice(text: str, lang='ur') -> str:
107
  tts = gTTS(text=text, lang=lang)
108
  tts_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
109
  tts.save(tts_file.name)
110
  return tts_file.name
111
 
112
- # Generate answer using vectorstore context and LLM
113
  def get_answer(query: str, vectorstore: FAISS, mode: str) -> str:
114
  retriever = vectorstore.as_retriever()
115
  docs = retriever.get_relevant_documents(query)
116
  context = "\n".join([doc.page_content for doc in docs])
117
 
118
- if mode == "📖 Storytelling":
119
- prompt = story_prompt.format(question=query, context=context)
120
- else:
121
- prompt = explain_prompt.format(question=query, context=context)
122
-
123
- answer = llm.invoke(prompt)
124
- return answer
125
 
126
- # Main app flow
127
  if uploaded_file:
128
- raw_text = extract_text(uploaded_file)
129
  if not raw_text.strip():
130
- st.error("No text extracted from the file. Please check the file content.")
131
  else:
132
- st.success("📄 Syllabus loaded successfully!")
133
  query = st.text_input("❓ Ask your question (Urdu or English)")
134
  if query:
135
  with st.spinner("Thinking..."):
136
  vectorstore = create_vectorstore(raw_text)
137
  answer = get_answer(query, vectorstore, mode)
138
- st.markdown("### Answer:")
139
  st.write(answer)
140
 
141
  if voice_enabled:
@@ -143,4 +115,4 @@ if uploaded_file:
143
  with open(audio_file, "rb") as audio:
144
  st.audio(audio.read(), format="audio/mp3")
145
  else:
146
- st.info("Please paste a Google Drive link to your syllabus file (.pdf or .docx) above.")
 
1
  import streamlit as st
2
  from pypdf import PdfReader
3
  from docx import Document
 
 
 
 
4
  from PIL import Image
5
+ import pytesseract
6
+ from gtts import gTTS
7
+ import tempfile
8
 
9
  from langchain.vectorstores import FAISS
10
  from langchain.embeddings import HuggingFaceEmbeddings
11
  from langchain.text_splitter import CharacterTextSplitter
12
  from langchain.prompts import PromptTemplate
13
  from langchain.llms import HuggingFacePipeline
 
14
  from transformers import pipeline
15
 
16
+ # Set up HuggingFace pipeline
 
17
  text_gen_pipeline = pipeline(
18
  "text-generation",
19
  model="distilgpt2",
 
21
  )
22
  llm = HuggingFacePipeline(pipeline=text_gen_pipeline)
23
 
24
+ # Streamlit page config
25
  st.set_page_config(page_title="Learning with Fun", layout="wide")
26
  st.title("📘 Learning with Fun - Kids QA App")
27
  st.markdown("Ask questions from your syllabus! 📚")
28
 
29
+ # Sidebar options
30
  grade = st.sidebar.selectbox("Select Grade", ["Grade 5", "Grade 6"])
31
  subject = st.sidebar.selectbox("Select Subject", ["Science", "Math", "Computer", "Islamiyat"])
32
  mode = st.sidebar.radio("Answer Format", ["🧠 Beginner Explanation", "📖 Storytelling"])
33
  voice_enabled = st.sidebar.checkbox("🔈 Enable Voice", value=True)
34
 
35
+ # File upload widget
36
+ uploaded_file = st.file_uploader("📁 Upload Syllabus File (.pdf, .docx, .jpg, .jpeg, .png)", type=["pdf", "docx", "jpg", "jpeg", "png"])
 
 
 
 
 
 
 
37
 
38
+ # Extract text from file
39
+ def extract_text_from_file(file) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  text = ""
41
+ if file.name.endswith(".pdf"):
42
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
43
+ tmp_file.write(file.read())
44
+ reader = PdfReader(tmp_file.name)
45
+ for page in reader.pages:
46
+ page_text = page.extract_text()
47
+ if page_text:
48
+ text += page_text
49
+ elif file.name.endswith(".docx"):
50
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp_file:
51
+ tmp_file.write(file.read())
52
+ doc = Document(tmp_file.name)
53
+ for para in doc.paragraphs:
54
+ text += para.text + "\n"
55
+ elif file.name.lower().endswith((".jpg", ".jpeg", ".png")):
56
+ img = Image.open(file)
57
+ text = pytesseract.image_to_string(img, lang='eng+urd')
58
  else:
59
+ st.error("Unsupported file format.")
60
  return text
61
 
62
+ # Create vector DB
63
  def create_vectorstore(text: str) -> FAISS:
64
  splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
65
  docs = splitter.create_documents([text])
66
  embeddings = HuggingFaceEmbeddings()
67
+ return FAISS.from_documents(docs, embeddings)
 
68
 
69
  # Prompt templates
70
  story_prompt = PromptTemplate.from_template(
 
79
  "براہ کرم بچے کو اردو زبان میں آسان انداز میں سمجھائیں۔"
80
  )
81
 
82
+ # Text-to-speech
83
  def generate_voice(text: str, lang='ur') -> str:
84
  tts = gTTS(text=text, lang=lang)
85
  tts_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
86
  tts.save(tts_file.name)
87
  return tts_file.name
88
 
89
+ # Answer generation
90
  def get_answer(query: str, vectorstore: FAISS, mode: str) -> str:
91
  retriever = vectorstore.as_retriever()
92
  docs = retriever.get_relevant_documents(query)
93
  context = "\n".join([doc.page_content for doc in docs])
94
 
95
+ prompt = story_prompt.format(question=query, context=context) if mode == "📖 Storytelling" else explain_prompt.format(question=query, context=context)
96
+ return llm.invoke(prompt)
 
 
 
 
 
97
 
98
+ # Main logic
99
  if uploaded_file:
100
+ raw_text = extract_text_from_file(uploaded_file)
101
  if not raw_text.strip():
102
+ st.error("⚠️ No text found in the uploaded file.")
103
  else:
104
+ st.success(" Syllabus loaded successfully!")
105
  query = st.text_input("❓ Ask your question (Urdu or English)")
106
  if query:
107
  with st.spinner("Thinking..."):
108
  vectorstore = create_vectorstore(raw_text)
109
  answer = get_answer(query, vectorstore, mode)
110
+ st.markdown("### 📘 Answer:")
111
  st.write(answer)
112
 
113
  if voice_enabled:
 
115
  with open(audio_file, "rb") as audio:
116
  st.audio(audio.read(), format="audio/mp3")
117
  else:
118
+ st.info("📂 Please upload your syllabus file to begin.")