MiakOnline committed on
Commit
d2728e2
·
verified ·
1 Parent(s): 3dc53a6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -70
app.py CHANGED
@@ -1,12 +1,12 @@
1
  import streamlit as st
2
  from pypdf import PdfReader
3
  from docx import Document
4
- import tempfile
5
- from gtts import gTTS
6
-
7
  from PIL import Image
 
 
 
8
 
9
- from langchain.vectorstores import FAISS
10
  from langchain.embeddings import HuggingFaceEmbeddings
11
  from langchain.text_splitter import CharacterTextSplitter
12
  from langchain.prompts import PromptTemplate
@@ -14,71 +14,84 @@ from langchain.llms import HuggingFacePipeline
14
 
15
  from transformers import pipeline
16
 
17
-
18
- # Setup HuggingFace pipeline with distilgpt2 (CPU)
19
- # text_gen_pipeline = pipeline(
20
- # "text-generation",
21
- # model="distilgpt2",
22
- # device=-1 # CPU only
23
- #)
24
  text_gen_pipeline = pipeline(
25
  "text-generation",
26
  model="distilgpt2",
27
- device=-1, # CPU only
28
- max_new_tokens=150, # Increased from default
29
- do_sample=True,
30
- temperature=0.7,
31
- top_k=50,
32
- top_p=0.95
33
  )
34
  llm = HuggingFacePipeline(pipeline=text_gen_pipeline)
35
 
36
- # Streamlit app config
37
- st.set_page_config(page_title="Learning with Fun", layout="wide")
38
- st.title("📘 Learning with Fun - Kids QA App")
39
- st.markdown("Ask questions from your syllabus! 📚")
40
-
41
- # Sidebar widgets
42
- grade = st.sidebar.selectbox("Select Grade", ["Grade 5", "Grade 6"])
43
- subject = st.sidebar.selectbox("Select Subject", ["Science", "Math", "Computer", "Islamiyat"])
44
- mode = st.sidebar.radio("Answer Format", ["🧠 Beginner Explanation", "📖 Storytelling"])
45
- voice_enabled = st.sidebar.checkbox("🔈 Enable Voice", value=True)
46
-
47
- # File uploader for syllabus
48
- uploaded_file = st.file_uploader(
49
- "Upload your syllabus file (PDF, DOCX, JPEG, PNG, JPG)",
50
- type=["pdf", "docx", "jpeg", "png", "jpg"]
51
- )
52
-
53
- # Extract text content from uploaded file directly
54
- def extract_text_from_uploaded(file) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  text = ""
56
- if file is None:
57
- return text
58
-
59
  if file.type == "application/pdf":
60
  try:
61
- reader = PdfReader(file)
62
- for page in reader.pages:
63
- page_text = page.extract_text()
64
- if page_text:
65
- text += page_text
 
 
 
66
  except Exception as e:
67
- st.error(f"Error reading PDF file: {e}")
68
  elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
69
- try:
70
- doc = Document(file)
71
- for para in doc.paragraphs:
72
- text += para.text + "\n"
73
- except Exception as e:
74
- st.error(f"Error reading DOCX file: {e}")
75
  elif file.type in ["image/jpeg", "image/png"]:
76
- st.warning("Image files currently are not supported for text extraction.")
 
 
 
 
 
77
  else:
78
  st.error("Unsupported file format.")
79
- return text
80
 
81
- # Create vector store for similarity search
82
  def create_vectorstore(text: str) -> FAISS:
83
  splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
84
  docs = splitter.create_documents([text])
@@ -86,7 +99,7 @@ def create_vectorstore(text: str) -> FAISS:
86
  vectorstore = FAISS.from_documents(docs, embeddings)
87
  return vectorstore
88
 
89
- # Prompt templates
90
  story_prompt = PromptTemplate.from_template(
91
  "ایک طالب علم نے سوال کیا: {question}\n"
92
  "نصاب کی معلومات: {context}\n"
@@ -99,14 +112,14 @@ explain_prompt = PromptTemplate.from_template(
99
  "براہ کرم بچے کو اردو زبان میں آسان انداز میں سمجھائیں۔"
100
  )
101
 
102
- # Generate speech audio from text
103
  def generate_voice(text: str, lang='ur') -> str:
104
  tts = gTTS(text=text, lang=lang)
105
  tts_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
106
  tts.save(tts_file.name)
107
  return tts_file.name
108
 
109
- # Generate answer using vectorstore context and LLM
110
  def get_answer(query: str, vectorstore: FAISS, mode: str) -> str:
111
  retriever = vectorstore.as_retriever()
112
  docs = retriever.get_relevant_documents(query)
@@ -117,27 +130,26 @@ def get_answer(query: str, vectorstore: FAISS, mode: str) -> str:
117
  else:
118
  prompt = explain_prompt.format(question=query, context=context)
119
 
120
- answer = llm.invoke(prompt)
121
- return answer
122
 
123
- # Main app flow
124
  if uploaded_file:
125
- raw_text = extract_text_from_uploaded(uploaded_file)
126
- if not raw_text.strip():
127
- st.error("No text extracted from the file. Please check the file content.")
128
  else:
129
- st.success("📄 Syllabus loaded successfully!")
130
- query = st.text_input(" Ask your question (Urdu or English)")
131
  if query:
132
- with st.spinner("Thinking..."):
133
  vectorstore = create_vectorstore(raw_text)
134
  answer = get_answer(query, vectorstore, mode)
135
  st.markdown("### ✅ Answer:")
136
  st.write(answer)
137
 
138
  if voice_enabled:
139
- audio_file = generate_voice(answer)
140
- with open(audio_file, "rb") as audio:
141
- st.audio(audio.read(), format="audio/mp3")
142
  else:
143
- st.info("Please upload a syllabus file above.")
 
1
  import streamlit as st
2
  from pypdf import PdfReader
3
  from docx import Document
 
 
 
4
  from PIL import Image
5
+ from gtts import gTTS
6
+ import tempfile
7
+ import io
8
 
9
+ from langchain_community.vectorstores import FAISS
10
  from langchain.embeddings import HuggingFaceEmbeddings
11
  from langchain.text_splitter import CharacterTextSplitter
12
  from langchain.prompts import PromptTemplate
 
14
 
15
  from transformers import pipeline
16
 
17
+ # ------------------------ Setup HuggingFace LLM -----------------------
 
 
 
 
 
 
18
  text_gen_pipeline = pipeline(
19
  "text-generation",
20
  model="distilgpt2",
21
+ device=-1, # CPU
22
+ max_new_tokens=150
 
 
 
 
23
  )
24
  llm = HuggingFacePipeline(pipeline=text_gen_pipeline)
25
 
26
+ # -------------------------- Streamlit UI Setup -------------------------
27
+ st.set_page_config(page_title="Learning with Fun", layout="centered")
28
+
29
+ st.markdown("""
30
+ <style>
31
+ body {
32
+ background: linear-gradient(to right, #f9f9f9, #e0f7fa);
33
+ }
34
+ .stApp {
35
+ font-family: 'Segoe UI', sans-serif;
36
+ }
37
+ .title {
38
+ text-align: center;
39
+ font-size: 36px;
40
+ font-weight: bold;
41
+ color: #006064;
42
+ margin-bottom: 10px;
43
+ }
44
+ .subtext {
45
+ text-align: center;
46
+ font-size: 18px;
47
+ color: #00796B;
48
+ margin-bottom: 30px;
49
+ }
50
+ </style>
51
+ """, unsafe_allow_html=True)
52
+
53
+ st.markdown('<div class="title">📘 Learning with Fun</div>', unsafe_allow_html=True)
54
+ st.markdown('<div class="subtext">Ask questions from your syllabus in a fun way!</div>', unsafe_allow_html=True)
55
+
56
+ # -------------------------- Sidebar Controls ----------------------------
57
+ grade = st.sidebar.selectbox("🎓 Select Grade", ["Grade 5", "Grade 6"])
58
+ subject = st.sidebar.selectbox("📘 Select Subject", ["Science", "Math", "Computer", "Islamiyat"])
59
+ mode = st.sidebar.radio("🎯 Answer Format", ["🧠 Beginner Explanation", "📖 Storytelling"])
60
+ voice_enabled = st.sidebar.checkbox("🔈 Enable Voice Output", value=True)
61
+
62
+ # --------------------- File Upload and Text Extraction -------------------
63
+ uploaded_file = st.file_uploader("📂 Upload Syllabus File (PDF, DOCX, JPEG, PNG)", type=["pdf", "docx", "jpeg", "jpg", "png"])
64
+
65
+ def extract_text(file) -> str:
66
  text = ""
 
 
 
67
  if file.type == "application/pdf":
68
  try:
69
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
70
+ tmp.write(file.read())
71
+ tmp.seek(0)
72
+ reader = PdfReader(tmp.name)
73
+ for page in reader.pages:
74
+ page_text = page.extract_text()
75
+ if page_text:
76
+ text += page_text
77
  except Exception as e:
78
+ st.error(f"Failed to read PDF: {e}")
79
  elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
80
+ doc = Document(io.BytesIO(file.read()))
81
+ for para in doc.paragraphs:
82
+ text += para.text + "\n"
 
 
 
83
  elif file.type in ["image/jpeg", "image/png"]:
84
+ try:
85
+ import pytesseract
86
+ image = Image.open(file)
87
+ text = pytesseract.image_to_string(image)
88
+ except ImportError:
89
+ st.error("Please install pytesseract for image to text conversion.")
90
  else:
91
  st.error("Unsupported file format.")
92
+ return text.strip()
93
 
94
+ # -------------------- Create Vector Store -------------------------------
95
  def create_vectorstore(text: str) -> FAISS:
96
  splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
97
  docs = splitter.create_documents([text])
 
99
  vectorstore = FAISS.from_documents(docs, embeddings)
100
  return vectorstore
101
 
102
+ # ------------------------ Prompt Templates ------------------------------
103
  story_prompt = PromptTemplate.from_template(
104
  "ایک طالب علم نے سوال کیا: {question}\n"
105
  "نصاب کی معلومات: {context}\n"
 
112
  "براہ کرم بچے کو اردو زبان میں آسان انداز میں سمجھائیں۔"
113
  )
114
 
115
+ # -------------------------- TTS Generator -------------------------------
116
  def generate_voice(text: str, lang='ur') -> str:
117
  tts = gTTS(text=text, lang=lang)
118
  tts_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
119
  tts.save(tts_file.name)
120
  return tts_file.name
121
 
122
+ # -------------------------- Answer Generator ----------------------------
123
  def get_answer(query: str, vectorstore: FAISS, mode: str) -> str:
124
  retriever = vectorstore.as_retriever()
125
  docs = retriever.get_relevant_documents(query)
 
130
  else:
131
  prompt = explain_prompt.format(question=query, context=context)
132
 
133
+ result = llm.invoke(prompt)
134
+ return result.strip()
135
 
136
+ # ----------------------------- Main Logic -------------------------------
137
  if uploaded_file:
138
+ raw_text = extract_text(uploaded_file)
139
+ if not raw_text:
140
+ st.error("No text extracted from file.")
141
  else:
142
+ st.success(" Syllabus loaded successfully!")
143
+ query = st.text_input("💬 Ask a question (Urdu or English):")
144
  if query:
145
+ with st.spinner("🤔 Thinking..."):
146
  vectorstore = create_vectorstore(raw_text)
147
  answer = get_answer(query, vectorstore, mode)
148
  st.markdown("### ✅ Answer:")
149
  st.write(answer)
150
 
151
  if voice_enabled:
152
+ audio_path = generate_voice(answer)
153
+ st.audio(audio_path, format="audio/mp3")
 
154
  else:
155
+ st.info("Please upload your syllabus file to begin.")