andevs committed on
Commit
a01f67d
·
verified ·
1 Parent(s): b2863f6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +729 -189
app.py CHANGED
@@ -1,7 +1,8 @@
1
  """
2
- StudyFlow AI Backend - AI-Powered Question Generation
3
- Uses Hugging Face Inference API for intelligent question generation
4
  """
 
5
  import os
6
  import json
7
  import sqlite3
@@ -9,19 +10,21 @@ import hashlib
9
  import tempfile
10
  import re
11
  import requests
 
12
  from datetime import datetime
13
- from typing import List, Dict, Optional
14
- from fastapi import FastAPI, UploadFile, File, Form, HTTPException
15
- from fastapi.responses import JSONResponse, HTMLResponse
16
  from fastapi.middleware.cors import CORSMiddleware
17
  from fastapi.staticfiles import StaticFiles
18
  import PyPDF2
19
  from youtube_transcript_api import YouTubeTranscriptApi
 
20
 
21
  # Initialize FastAPI
22
- app = FastAPI(title="StudyFlow AI", version="3.0.0")
23
 
24
- # CORS middleware
25
  app.add_middleware(
26
  CORSMiddleware,
27
  allow_origins=["*"],
@@ -30,19 +33,26 @@ app.add_middleware(
30
  allow_headers=["*"],
31
  )
32
 
33
- # Hugging Face API configuration
 
 
34
  HF_API_TOKEN = os.environ.get("HF_API_TOKEN", "")
35
- HF_API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
 
 
 
36
 
37
  # Database setup
38
  DB_PATH = "/data/studyflow.db" if os.path.exists("/data") else "studyflow.db"
39
 
 
 
40
  def init_db():
41
- """Initialize SQLite database"""
42
  conn = sqlite3.connect(DB_PATH)
43
  cursor = conn.cursor()
44
 
45
- # Sessions table with page selections
46
  cursor.execute('''
47
  CREATE TABLE IF NOT EXISTS sessions (
48
  id TEXT PRIMARY KEY,
@@ -57,7 +67,7 @@ def init_db():
57
  )
58
  ''')
59
 
60
- # Questions table
61
  cursor.execute('''
62
  CREATE TABLE IF NOT EXISTS questions (
63
  id TEXT PRIMARY KEY,
@@ -77,47 +87,114 @@ def init_db():
77
  )
78
  ''')
79
 
80
- # Pages table for PDF page content
81
  cursor.execute('''
82
  CREATE TABLE IF NOT EXISTS pages (
83
  id TEXT PRIMARY KEY,
84
  session_id TEXT NOT NULL,
85
  page_number INTEGER NOT NULL,
86
  content TEXT NOT NULL,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  FOREIGN KEY (session_id) REFERENCES sessions (id) ON DELETE CASCADE
88
  )
89
  ''')
90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  conn.commit()
92
  conn.close()
 
93
 
 
94
  init_db()
95
 
96
- def generate_id(text: str = None):
 
 
97
  """Generate a unique ID"""
98
- import uuid
99
- if text:
100
- return hashlib.md5(text.encode()).hexdigest()[:12]
101
- return str(uuid.uuid4())[:12]
102
 
103
  def extract_text_from_pdf(file_path: str) -> Dict[int, str]:
104
- """Extract text from PDF file and return pages dictionary"""
 
 
 
105
  pages_text = {}
106
  try:
107
  with open(file_path, 'rb') as file:
108
  pdf_reader = PyPDF2.PdfReader(file)
 
 
109
  for page_num, page in enumerate(pdf_reader.pages, start=1):
110
- page_text = page.extract_text()
111
- if page_text and len(page_text.strip()) > 50:
112
- pages_text[page_num] = page_text.strip()
113
- return pages_text
 
 
 
 
 
 
 
 
 
 
114
  except Exception as e:
115
- print(f"PDF extraction error: {str(e)}")
116
  return {}
117
 
118
- def extract_text_from_youtube(url: str) -> str:
119
- """Extract transcript from YouTube video"""
 
 
120
  try:
 
121
  if "youtube.com/watch?v=" in url:
122
  video_id = url.split("v=")[-1].split("&")[0]
123
  elif "youtu.be/" in url:
@@ -125,183 +202,414 @@ def extract_text_from_youtube(url: str) -> str:
125
  else:
126
  return ""
127
 
128
- transcript = YouTubeTranscriptApi.get_transcript(video_id)
129
- text = " ".join([entry['text'] for entry in transcript])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  return text
 
 
 
 
 
 
 
131
  except Exception as e:
132
- print(f"YouTube extraction error: {str(e)}")
133
  return ""
134
 
135
- def call_hf_api(prompt: str, max_length: int = 500) -> Optional[str]:
136
- """Call Hugging Face API for AI-powered question generation"""
137
- if not HF_API_TOKEN:
 
 
 
 
138
  return None
139
 
140
  try:
141
- headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}
 
 
 
 
142
  payload = {
143
  "inputs": prompt,
144
  "parameters": {
145
  "max_new_tokens": max_length,
146
- "temperature": 0.7,
147
- "top_p": 0.9,
148
- "do_sample": True
 
149
  }
150
  }
151
- response = requests.post(HF_API_URL, headers=headers, json=payload, timeout=30)
 
 
 
152
  if response.status_code == 200:
153
  result = response.json()
154
- return result[0].get("generated_text", "")
 
 
 
 
 
 
 
 
155
  return None
156
  except Exception as e:
157
- print(f"HF API error: {str(e)}")
158
  return None
159
 
160
  def generate_questions_with_ai(content: str, difficulty: str, count: int, page_ref: int = None) -> List[Dict]:
161
- """Generate intelligent questions using AI"""
 
 
 
 
 
 
 
 
162
 
163
- # Build prompt for AI
164
- difficulty_prompts = {
165
- "easy": "Generate basic recall and definition questions that test understanding of key terms and simple facts.",
166
- "medium": "Generate conceptual questions that test understanding of relationships, causes, effects, and comparisons.",
167
- "hard": "Generate analytical questions that require critical thinking, application, evaluation, and synthesis of ideas."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  }
169
 
170
- prompt = f"""You are an expert educator creating study questions. Based on the following text, generate {count} {difficulty}-level questions.
171
 
172
- {difficulty_prompts[difficulty]}
 
173
 
174
- For each question, provide:
175
- 1. The question text
176
- 2. Question type (multiple_choice, true_false, or short_answer)
177
- 3. For multiple choice: 4 options (A, B, C, D) with one correct
178
- 4. For true/false: the correct answer
179
- 5. For short answer: a model answer
180
- 6. A brief explanation of why the answer is correct
181
 
182
- Format your response as JSON array:
183
  [
184
  {{
185
- "text": "question text",
186
- "type": "multiple_choice",
187
- "options": ["option1", "option2", "option3", "option4"],
188
- "correct_answer": "the correct option text",
189
- "explanation": "explanation here"
190
  }}
191
  ]
192
 
193
- TEXT CONTENT:
194
- {content[:3000]}
 
 
195
 
196
- Generate {count} questions in JSON format:"""
197
 
198
- ai_response = call_hf_api(prompt, 2000)
 
199
 
200
  if ai_response:
201
  try:
202
  # Extract JSON from response
203
- json_match = re.search(r'\[[\s\S]*\]', ai_response)
204
  if json_match:
205
  questions_data = json.loads(json_match.group())
206
  questions = []
207
  for i, q_data in enumerate(questions_data[:count]):
208
- questions.append({
209
  "id": generate_id(f"q_{i}"),
210
  "question_text": q_data.get("text", ""),
211
  "question_type": q_data.get("type", "short_answer"),
212
  "options": json.dumps(q_data.get("options", [])) if q_data.get("options") else None,
213
- "correct_answer": q_data.get("correct_answer", ""),
214
  "difficulty": difficulty,
215
- "explanation": q_data.get("explanation", "Review the material for this answer."),
216
  "page_reference": page_ref
217
- })
 
 
 
218
  if questions:
 
219
  return questions
220
- except:
221
- pass
 
 
222
 
223
- # Fallback to intelligent template-based generation
 
224
  return generate_questions_fallback(content, difficulty, count, page_ref)
225
 
226
  def generate_questions_fallback(content: str, difficulty: str, count: int, page_ref: int = None) -> List[Dict]:
227
- """Enhanced fallback question generation with better intelligence"""
 
 
 
228
 
229
- # Extract sentences, facts, and concepts
 
 
 
230
  sentences = re.split(r'[.!?]+', content)
231
- sentences = [s.strip() for s in sentences if len(s.strip()) > 30]
 
 
 
 
 
 
 
232
 
233
- # Extract numbers/dates
234
- numbers = re.findall(r'\b\d{4}\b|\b\d+\.\d+\b|\b\d+%\b|\b\d+\s+(?:percent|million|billion|thousand)\b', content)
 
 
235
 
236
- # Extract proper nouns (potential key terms)
237
- proper_nouns = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', content)
 
 
 
 
 
 
238
 
239
  questions = []
240
 
241
- for i in range(min(count, max(len(sentences), 5))):
242
- sentence = sentences[i % len(sentences)] if sentences else content[:200]
243
- qid = generate_id(f"q_{i}")
244
-
245
- if difficulty == "easy" and len(numbers) > 0:
246
- # Number-based question
247
- num = numbers[i % len(numbers)]
248
- questions.append({
249
- "id": qid,
250
- "question_text": f"What is the significance of {num} in the context of this material?",
251
- "question_type": "short_answer",
252
- "options": None,
253
- "correct_answer": f"The number {num} represents a key figure or measurement discussed in the text.",
254
- "difficulty": "easy",
255
- "explanation": "Look for context around this number in the material.",
256
- "page_reference": page_ref
257
- })
258
- elif difficulty == "easy" and proper_nouns:
259
- # Term definition question
260
- term = proper_nouns[i % len(proper_nouns)]
261
- questions.append({
262
- "id": qid,
263
- "question_text": f"What does the term '{term}' refer to in this material?",
264
- "question_type": "short_answer",
265
- "options": None,
266
- "correct_answer": f"'{term}' is a key term discussed in the material that relates to the main topic.",
267
- "difficulty": "easy",
268
- "explanation": f"Look for definitions or context around '{term}' in the text.",
269
- "page_reference": page_ref
270
- })
271
- elif difficulty == "medium":
272
- # Conceptual question with multiple choice
273
- words = sentence.split()
274
- key_word = next((w for w in words if len(w) > 5), "the concept")
 
 
275
  options = [
276
- f"The main idea about {key_word} is clearly explained",
277
- f"A minor detail mentioned in passing",
278
- f"An unrelated example provided for context",
279
  f"The conclusion drawn from the discussion"
280
  ]
 
281
  questions.append({
282
- "id": qid,
283
- "question_text": f"Based on the text: \"{sentence[:150]}...\" What is the main idea being conveyed?",
284
  "question_type": "multiple_choice",
285
  "options": json.dumps(options),
286
  "correct_answer": options[0],
287
  "difficulty": "medium",
288
- "explanation": f"The text emphasizes {key_word} as an important concept.",
289
  "page_reference": page_ref
290
  })
291
- else:
292
- # Analysis question
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
  questions.append({
294
- "id": qid,
295
- "question_text": f"Analyze the following statement and explain its implications: \"{sentence[:200]}...\"",
296
  "question_type": "short_answer",
297
  "options": None,
298
- "correct_answer": f"This statement suggests that {sentence[:100]} which has important implications for understanding the broader context.",
299
  "difficulty": "hard",
300
- "explanation": "Critical analysis requires considering causes, effects, and connections to other concepts.",
301
  "page_reference": page_ref
302
  })
303
 
304
- return questions
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
 
306
  @app.post("/api/process-content")
307
  async def process_content(
@@ -311,96 +619,143 @@ async def process_content(
311
  content: str = Form(None),
312
  file: UploadFile = File(None),
313
  youtube_url: str = Form(None),
314
- selected_pages: str = Form(None), # JSON string of selected page numbers
315
- time_start: float = Form(None), # For YouTube time selection
316
- time_end: float = Form(None)
 
317
  ):
318
- """Process uploaded content with page/segment selection"""
 
 
 
 
 
319
 
320
- session_id = generate_id(title)
321
  text_content = ""
322
  pages_dict = {}
323
  total_pages = 0
324
  selected_pages_list = []
325
 
326
  try:
 
327
  if content_type == "text":
328
- text_content = content[:20000] if content else ""
 
 
 
329
 
330
- elif content_type == "pdf" and file:
 
 
 
 
331
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
332
  content_bytes = await file.read()
333
  temp_file.write(content_bytes)
334
  temp_file_path = temp_file.name
335
 
 
336
  pages_dict = extract_text_from_pdf(temp_file_path)
337
  os.unlink(temp_file_path)
 
338
  total_pages = len(pages_dict)
339
 
340
  # Parse selected pages
341
  if selected_pages:
342
- selected_pages_list = json.loads(selected_pages)
343
- else:
 
 
 
 
 
344
  selected_pages_list = list(pages_dict.keys())
345
 
346
- # Combine selected pages into text
347
- for page_num in selected_pages_list:
348
  if page_num in pages_dict:
349
  text_content += f"\n--- Page {page_num} ---\n{pages_dict[page_num]}\n"
350
 
351
- elif content_type == "youtube" and youtube_url:
352
- full_transcript = extract_text_from_youtube(youtube_url)
353
- # Handle time selection
354
- if time_start is not None and time_end is not None:
355
- # Would need timestamp-based transcript filtering
356
- text_content = full_transcript
357
- else:
358
- text_content = full_transcript
 
 
 
 
 
 
359
 
 
360
  if len(text_content) < 100:
361
- raise HTTPException(status_code=400, detail="Content too short. Please provide at least 100 characters.")
362
 
363
  # Generate questions
364
- questions = generate_questions_with_ai(text_content, difficulty, 5, None)
 
 
 
 
365
 
366
  # Save to database
367
  conn = sqlite3.connect(DB_PATH)
368
  cursor = conn.cursor()
369
 
370
  # Save session
371
- cursor.execute(
372
- """INSERT OR REPLACE INTO sessions
373
- (id, title, content_type, difficulty, selected_pages, total_pages, last_accessed)
374
- VALUES (?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)""",
375
- (session_id, title, content_type, difficulty,
376
- json.dumps(selected_pages_list) if selected_pages_list else None,
377
- total_pages)
378
- )
379
 
380
  # Save pages
381
  for page_num, page_content in pages_dict.items():
382
- cursor.execute(
383
- "INSERT OR REPLACE INTO pages (id, session_id, page_number, content) VALUES (?, ?, ?, ?)",
384
- (generate_id(f"page_{page_num}"), session_id, page_num, page_content[:5000])
385
- )
386
 
387
  # Save questions
388
  for q in questions:
389
- cursor.execute(
390
- """INSERT INTO questions
391
- (id, session_id, question_text, question_type, options, correct_answer, difficulty, explanation, page_reference)
392
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
393
- (q["id"], session_id, q["question_text"], q["question_type"],
394
- q.get("options"), q["correct_answer"], q["difficulty"], q.get("explanation", ""),
395
- q.get("page_reference"))
396
- )
 
 
 
 
 
 
 
 
 
 
 
397
 
398
  conn.commit()
399
  conn.close()
400
 
 
 
401
  return {
 
402
  "session_id": session_id,
403
  "question_count": len(questions),
 
404
  "total_pages": total_pages,
405
  "selected_pages": selected_pages_list
406
  }
@@ -408,16 +763,20 @@ async def process_content(
408
  except HTTPException:
409
  raise
410
  except Exception as e:
411
- print(f"Error: {str(e)}")
 
 
412
  raise HTTPException(status_code=500, detail=str(e))
413
 
414
  @app.get("/api/session/{session_id}")
415
  async def get_session(session_id: str):
416
- """Get session with all materials"""
 
417
  conn = sqlite3.connect(DB_PATH)
418
  conn.row_factory = sqlite3.Row
419
  cursor = conn.cursor()
420
 
 
421
  cursor.execute("SELECT * FROM sessions WHERE id = ?", (session_id,))
422
  session = cursor.fetchone()
423
 
@@ -425,32 +784,54 @@ async def get_session(session_id: str):
425
  conn.close()
426
  raise HTTPException(status_code=404, detail="Session not found")
427
 
428
- cursor.execute("SELECT * FROM questions WHERE session_id = ?", (session_id,))
 
 
 
 
429
  questions = [dict(row) for row in cursor.fetchall()]
430
 
 
 
 
 
 
 
 
 
 
 
 
 
 
431
  cursor.execute("SELECT * FROM pages WHERE session_id = ? ORDER BY page_number", (session_id,))
432
  pages = [dict(row) for row in cursor.fetchall()]
433
 
 
434
  total_questions = len(questions)
435
  correct_answers = sum(1 for q in questions if q.get("is_correct") == 1)
436
  accuracy = round((correct_answers / total_questions * 100) if total_questions > 0 else 0, 1)
437
 
 
438
  conn.close()
439
 
440
  return {
441
  "session": dict(session),
442
- "pages": pages,
443
  "questions": questions,
 
 
444
  "performance": {
445
  "total_questions": total_questions,
446
  "correct_answers": correct_answers,
447
- "accuracy": accuracy
 
448
  }
449
  }
450
 
451
  @app.get("/api/user/sessions")
452
  async def get_user_sessions():
453
- """Get all user sessions"""
 
454
  conn = sqlite3.connect(DB_PATH)
455
  conn.row_factory = sqlite3.Row
456
  cursor = conn.cursor()
@@ -458,7 +839,19 @@ async def get_user_sessions():
458
  cursor.execute("SELECT * FROM sessions ORDER BY last_accessed DESC")
459
  sessions = [dict(row) for row in cursor.fetchall()]
460
 
 
 
 
 
 
 
 
 
 
 
 
461
  conn.close()
 
462
  return {"sessions": sessions}
463
 
464
  @app.post("/api/submit-answer")
@@ -468,10 +861,12 @@ async def submit_answer(
468
  user_answer: str = Form(...),
469
  time_spent: int = Form(0)
470
  ):
471
- """Submit an answer for evaluation"""
 
472
  conn = sqlite3.connect(DB_PATH)
473
  cursor = conn.cursor()
474
 
 
475
  cursor.execute("SELECT correct_answer, question_type FROM questions WHERE id = ? AND session_id = ?",
476
  (question_id, session_id))
477
  result = cursor.fetchone()
@@ -483,24 +878,49 @@ async def submit_answer(
483
  correct_answer = result[0]
484
  question_type = result[1]
485
 
486
- # Evaluate answer
487
  is_correct = 0
 
488
  if question_type == "multiple_choice":
 
489
  is_correct = 1 if user_answer.strip() == correct_answer.strip() else 0
 
490
  elif question_type == "true_false":
 
491
  is_correct = 1 if user_answer.strip().lower() == correct_answer.strip().lower() else 0
492
- else:
 
 
 
 
 
 
 
493
  # Smart evaluation for short answers
494
- user_lower = user_answer.strip().lower()
495
- correct_lower = correct_answer.strip().lower()
496
- keywords = re.findall(r'\b[a-z]{4,}\b', correct_lower)
497
- matches = sum(1 for kw in keywords if kw in user_lower)
498
- is_correct = 1 if matches >= len(keywords) * 0.3 or len(user_lower) > 40 else 0
 
 
 
 
 
 
 
 
 
499
 
500
- cursor.execute(
501
- "UPDATE questions SET user_answer = ?, is_correct = ?, time_spent = ? WHERE id = ?",
502
- (user_answer, is_correct, time_spent, question_id)
503
- )
 
 
 
 
 
504
 
505
  conn.commit()
506
  conn.close()
@@ -508,13 +928,133 @@ async def submit_answer(
508
  return {
509
  "is_correct": bool(is_correct),
510
  "correct_answer": correct_answer,
511
- "feedback": "Correct!" if is_correct else f"The correct answer is: {correct_answer}"
512
  }
513
 
514
- @app.get("/health")
515
- async def health_check():
516
- return {"status": "healthy", "timestamp": datetime.now().isoformat()}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
517
 
518
  if __name__ == "__main__":
519
  import uvicorn
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
520
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
  """
2
+ StudyFlow AI Backend - Complete Production Version
3
+ Features: AI-powered question generation, PDF page selection, YouTube transcript extraction, full database persistence
4
  """
5
+
6
  import os
7
  import json
8
  import sqlite3
 
10
  import tempfile
11
  import re
12
  import requests
13
+ import uuid
14
  from datetime import datetime
15
+ from typing import List, Dict, Optional, Tuple
16
+ from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Request
17
+ from fastapi.responses import JSONResponse, HTMLResponse, FileResponse
18
  from fastapi.middleware.cors import CORSMiddleware
19
  from fastapi.staticfiles import StaticFiles
20
  import PyPDF2
21
  from youtube_transcript_api import YouTubeTranscriptApi
22
+ from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound
23
 
24
  # Initialize FastAPI
25
+ app = FastAPI(title="StudyFlow AI", version="3.0.0", description="AI-Powered Study Assistant")
26
 
27
+ # CORS middleware - Allow all origins for development
28
  app.add_middleware(
29
  CORSMiddleware,
30
  allow_origins=["*"],
 
33
  allow_headers=["*"],
34
  )
35
 
36
+ # ==================== CONFIGURATION ====================
37
+
38
+ # Hugging Face API configuration (optional - will use fallback if not set)
39
  HF_API_TOKEN = os.environ.get("HF_API_TOKEN", "")
40
+ HF_API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3"
41
+ # Alternative models (uncomment to use):
42
+ # HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
43
+ # HF_API_URL = "https://api-inference.huggingface.co/models/google/flan-t5-large"
44
 
45
  # Database setup
46
  DB_PATH = "/data/studyflow.db" if os.path.exists("/data") else "studyflow.db"
47
 
48
+ # ==================== DATABASE INITIALIZATION ====================
49
+
50
  def init_db():
51
+ """Initialize SQLite database with all required tables"""
52
  conn = sqlite3.connect(DB_PATH)
53
  cursor = conn.cursor()
54
 
55
+ # Sessions table - stores main session info
56
  cursor.execute('''
57
  CREATE TABLE IF NOT EXISTS sessions (
58
  id TEXT PRIMARY KEY,
 
67
  )
68
  ''')
69
 
70
+ # Questions table - stores all generated questions
71
  cursor.execute('''
72
  CREATE TABLE IF NOT EXISTS questions (
73
  id TEXT PRIMARY KEY,
 
87
  )
88
  ''')
89
 
90
+ # Pages table - stores individual page content from PDFs
91
  cursor.execute('''
92
  CREATE TABLE IF NOT EXISTS pages (
93
  id TEXT PRIMARY KEY,
94
  session_id TEXT NOT NULL,
95
  page_number INTEGER NOT NULL,
96
  content TEXT NOT NULL,
97
+ created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
98
+ FOREIGN KEY (session_id) REFERENCES sessions (id) ON DELETE CASCADE
99
+ )
100
+ ''')
101
+
102
+ # Flashcards table
103
+ cursor.execute('''
104
+ CREATE TABLE IF NOT EXISTS flashcards (
105
+ id TEXT PRIMARY KEY,
106
+ session_id TEXT NOT NULL,
107
+ front TEXT NOT NULL,
108
+ back TEXT NOT NULL,
109
+ category TEXT,
110
+ difficulty TEXT,
111
+ created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
112
+ FOREIGN KEY (session_id) REFERENCES sessions (id) ON DELETE CASCADE
113
+ )
114
+ ''')
115
+
116
+ # Notes table
117
+ cursor.execute('''
118
+ CREATE TABLE IF NOT EXISTS notes (
119
+ id TEXT PRIMARY KEY,
120
+ session_id TEXT NOT NULL,
121
+ title TEXT NOT NULL,
122
+ content TEXT NOT NULL,
123
+ tags TEXT,
124
+ created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
125
+ updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
126
  FOREIGN KEY (session_id) REFERENCES sessions (id) ON DELETE CASCADE
127
  )
128
  ''')
129
 
130
+ # User profile table for analytics
131
+ cursor.execute('''
132
+ CREATE TABLE IF NOT EXISTS user_profile (
133
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
134
+ total_questions_answered INTEGER DEFAULT 0,
135
+ total_correct_answers INTEGER DEFAULT 0,
136
+ total_study_time INTEGER DEFAULT 0,
137
+ total_sessions_created INTEGER DEFAULT 0,
138
+ last_active TIMESTAMP DEFAULT CURRENT_TIMESTAMP
139
+ )
140
+ ''')
141
+
142
+ # Create indexes for better performance
143
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_questions_session ON questions(session_id)')
144
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_pages_session ON pages(session_id)')
145
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_flashcards_session ON flashcards(session_id)')
146
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_sessions_accessed ON sessions(last_accessed)')
147
+
148
  conn.commit()
149
  conn.close()
150
+ print(f"βœ… Database initialized at: {DB_PATH}")
151
 
152
+ # Initialize database on startup
153
  init_db()
154
 
155
+ # ==================== HELPER FUNCTIONS ====================
156
+
def generate_id(prefix: str = "") -> str:
    """Generate a short random identifier, optionally namespaced by a prefix.

    Args:
        prefix: Optional label placed in front of the random part and
            separated from it by an underscore.

    Returns:
        Twelve random hexadecimal characters, preceded by ``"<prefix>_"``
        when ``prefix`` is non-empty.
    """
    # Slice the hex form rather than str(uuid4()): the canonical string has a
    # hyphen at index 8, which would end up inside the ID and cost one
    # character of randomness.
    unique_id = uuid.uuid4().hex[:12]
    return f"{prefix}_{unique_id}" if prefix else unique_id
 
 
161
 
def extract_text_from_pdf(file_path: str) -> Dict[int, str]:
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        Mapping of 1-based page number to page text with all whitespace runs
        collapsed to single spaces. A page whose collapsed text is 30
        characters or shorter is kept in the mapping with the placeholder
        ``"[Page N - No extractable text content]"``; a page whose extraction
        raises is kept with ``"[Page N - Error extracting text]"``. An empty
        dict is returned when the file cannot be opened or parsed at all.
    """
    pages_text: Dict[int, str] = {}
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            total_pages = len(pdf_reader.pages)

            for page_num, page in enumerate(pdf_reader.pages, start=1):
                try:
                    # Collapse whitespace BEFORE measuring, so a page made up
                    # mostly of layout whitespace is not mistaken for content.
                    # ``or ''`` covers extractors that return None.
                    page_text = re.sub(r'\s+', ' ', page.extract_text() or '').strip()
                    if len(page_text) > 30:
                        pages_text[page_num] = page_text
                    else:
                        pages_text[page_num] = f"[Page {page_num} - No extractable text content]"
                except Exception as e:
                    # Best effort: one unreadable page must not lose the rest.
                    print(f"Error extracting page {page_num}: {str(e)}")
                    pages_text[page_num] = f"[Page {page_num} - Error extracting text]"

        print(f"✅ Extracted {len(pages_text)} pages from PDF (total pages: {total_pages})")
        return pages_text
    except Exception as e:
        # Callers treat an empty mapping as "nothing could be extracted".
        print(f"❌ PDF extraction error: {str(e)}")
        return {}
191
 
192
+ def extract_text_from_youtube(url: str, start_time: float = None, end_time: float = None) -> str:
193
+ """
194
+ Extract transcript from YouTube video with optional time filtering
195
+ """
196
  try:
197
+ # Extract video ID from URL
198
  if "youtube.com/watch?v=" in url:
199
  video_id = url.split("v=")[-1].split("&")[0]
200
  elif "youtu.be/" in url:
 
202
  else:
203
  return ""
204
 
205
+ # Get transcript
206
+ transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
207
+
208
+ # Filter by time if specified
209
+ if start_time is not None or end_time is not None:
210
+ filtered_transcript = []
211
+ for entry in transcript_list:
212
+ entry_time = entry['start']
213
+ if start_time is not None and entry_time < start_time:
214
+ continue
215
+ if end_time is not None and entry_time > end_time:
216
+ continue
217
+ filtered_transcript.append(entry)
218
+ transcript_list = filtered_transcript
219
+
220
+ # Combine text
221
+ text = " ".join([entry['text'] for entry in transcript_list])
222
+ print(f"βœ… Extracted {len(transcript_list)} segments from YouTube video")
223
  return text
224
+
225
+ except TranscriptsDisabled:
226
+ print("❌ Transcripts disabled for this video")
227
+ return ""
228
+ except NoTranscriptFound:
229
+ print("❌ No transcript found for this video")
230
+ return ""
231
  except Exception as e:
232
+ print(f"❌ YouTube extraction error: {str(e)}")
233
  return ""
234
 
def call_hf_api(prompt: str, max_length: int = 1000, temperature: float = 0.7) -> Optional[str]:
    """Send a text-generation request to the Hugging Face Inference API.

    Args:
        prompt: Full prompt text, sent as the request's ``inputs`` field.
        max_length: Value sent as ``max_new_tokens``.
        temperature: Sampling temperature sent with the request.

    Returns:
        The ``generated_text`` of the first result, or ``None`` when no token
        is configured, the request fails or times out, the status code is not
        200, or the response does not have the expected shape. This function
        does not raise; callers use ``None`` as the signal to fall back.
    """
    if not HF_API_TOKEN:
        print("⚠️ No HF_API_TOKEN provided, using fallback question generation")
        return None

    try:
        headers = {
            "Authorization": f"Bearer {HF_API_TOKEN}",
            "Content-Type": "application/json"
        }

        payload = {
            "inputs": prompt,
            "parameters": {
                "max_new_tokens": max_length,
                "temperature": temperature,
                "top_p": 0.95,
                "do_sample": True,
                # Ask for the completion only, not the prompt echoed back.
                "return_full_text": False
            }
        }

        print("📡 Calling Hugging Face API...")
        response = requests.post(HF_API_URL, headers=headers, json=payload, timeout=60)

        if response.status_code != 200:
            print(f"❌ HF API error: {response.status_code} - {response.text}")
            return None

        result = response.json()
        # Validate the shape explicitly instead of letting result[0] raise an
        # opaque KeyError/IndexError that would be logged as e.g. "error: 0".
        if not (isinstance(result, list) and result and isinstance(result[0], dict)):
            print(f"❌ HF API error: unexpected response shape ({type(result).__name__})")
            return None

        generated_text = result[0].get("generated_text", "")
        if not isinstance(generated_text, str):
            print("❌ HF API error: 'generated_text' is not a string")
            return None

        print(f"✅ AI response received ({len(generated_text)} chars)")
        return generated_text

    except requests.exceptions.Timeout:
        print("❌ HF API timeout after 60 seconds")
        return None
    except Exception as e:
        # Any other failure (network, JSON decoding) also means "use fallback".
        print(f"❌ HF API error: {str(e)}")
        return None
279
 
def generate_questions_with_ai(content: str, difficulty: str, count: int, page_ref: int = None) -> List[Dict]:
    """Generate study questions with the Hugging Face model, with a template fallback.

    Args:
        content: Source text; only the first 3000 characters go into the prompt.
        difficulty: ``"easy"``, ``"medium"`` or ``"hard"``. Any other value
            uses the medium instructions in the prompt.
        count: Maximum number of questions to return from the AI response.
        page_ref: Optional page number stored on every question.

    Returns:
        A list of question dicts with the keys ``id``, ``question_text``,
        ``question_type``, ``options`` (JSON-encoded list or ``None``),
        ``correct_answer``, ``difficulty``, ``explanation`` and
        ``page_reference``. When the AI call returns nothing usable, the
        result of ``generate_questions_fallback`` is returned instead.
    """

    # Limit content length for API
    max_content_length = 3000
    truncated_content = content[:max_content_length]
    if len(content) > max_content_length:
        truncated_content += "\n[Content truncated for length...]"

    # Build difficulty-specific prompts
    difficulty_instructions = {
        "easy": "\n".join([
            "",
            "Generate basic recall and definition questions that test:",
            "- Key terms and their definitions",
            "- Simple facts and dates",
            "- Basic concepts and their characteristics",
            "- Direct information from the text",
            "",
            "Question types: short_answer (for definitions/facts), true_false (for simple statements)",
            "",
        ]),
        "medium": "\n".join([
            "",
            "Generate conceptual understanding questions that test:",
            "- Relationships between concepts",
            "- Cause and effect relationships",
            "- Comparisons and contrasts",
            "- Application of concepts to examples",
            "- Why and how questions",
            "",
            "Question types: short_answer (for explanations), multiple_choice (for conceptual understanding)",
            "",
        ]),
        "hard": "\n".join([
            "",
            "Generate analytical and critical thinking questions that test:",
            "- Evaluation of arguments or evidence",
            "- Synthesis of multiple concepts",
            "- Prediction of outcomes or implications",
            "- Problem-solving using concepts",
            "- Critical analysis of assumptions",
            "",
            "Question types: short_answer (for analysis), multiple_choice (for complex scenarios)",
            "",
        ]),
    }
    instructions = difficulty_instructions.get(difficulty, difficulty_instructions["medium"])

    prompt = "\n".join([
        "You are an expert educator creating high-quality study questions.",
        "",
        "TEXT CONTENT:",
        truncated_content,
        "",
        "INSTRUCTIONS:",
        f"Generate {count} {difficulty}-difficulty level questions based ONLY on the text above.",
        instructions,
        "",
        "FORMAT YOUR RESPONSE AS A JSON ARRAY ONLY, no other text:",
        "[",
        "    {",
        '        "text": "Question text here",',
        '        "type": "short_answer",',
        '        "correct_answer": "Model answer here",',
        '        "explanation": "Brief explanation of why this is correct"',
        "    }",
        "]",
        "",
        "For multiple_choice questions, use:",
        '"type": "multiple_choice",',
        '"options": ["Option A", "Option B", "Option C", "Option D"],',
        '"correct_answer": "Option A"',
        "",
        f"Generate {count} unique, thoughtful questions now:",
    ])

    # Try AI generation first
    ai_response = call_hf_api(prompt, 2000, 0.8)

    if ai_response:
        try:
            # Extract the JSON array from the response; the model may wrap it
            # in extra prose despite the instructions.
            json_match = re.search(r'\[\s*\{[\s\S]*\}\s*\]', ai_response)
            if json_match:
                questions_data = json.loads(json_match.group())
                questions = []
                for i, q_data in enumerate(questions_data):
                    # Stop once enough VALID questions are collected; slicing
                    # to `count` up front would let invalid items use up slots.
                    if len(questions) >= count:
                        break
                    # One malformed entry must not discard the whole batch.
                    if not isinstance(q_data, dict):
                        continue

                    question_text = q_data.get("text", "")
                    if not isinstance(question_text, str) or len(question_text) <= 10:
                        continue

                    # The model may emit JSON booleans/numbers (e.g. for
                    # true_false); store the answer as text, since it is
                    # compared against user-entered strings.
                    answer = q_data.get("correct_answer")
                    if isinstance(answer, bool):
                        answer = "True" if answer else "False"
                    elif answer is None:
                        answer = "Review the material for this answer."
                    else:
                        answer = str(answer)

                    explanation = q_data.get("explanation")
                    if explanation is None:
                        explanation = "Review the material for more information."

                    questions.append({
                        "id": generate_id(f"q_{i}"),
                        "question_text": question_text,
                        "question_type": q_data.get("type", "short_answer"),
                        "options": json.dumps(q_data.get("options", [])) if q_data.get("options") else None,
                        "correct_answer": answer,
                        "difficulty": difficulty,
                        "explanation": str(explanation),
                        "page_reference": page_ref
                    })

                if questions:
                    print(f"✅ AI generated {len(questions)} questions")
                    return questions
        except json.JSONDecodeError as e:
            print(f"❌ Failed to parse AI response: {str(e)}")
        except Exception as e:
            print(f"❌ Error processing AI response: {str(e)}")

    # Fallback to smart template generation
    print("📝 Using fallback question generation")
    return generate_questions_fallback(content, difficulty, count, page_ref)
385
 
386
def generate_questions_fallback(content: str, difficulty: str, count: int, page_ref: int = None) -> List[Dict]:
    """
    Build template-based study questions without calling the AI service.

    Sentences and "key terms" (capitalized phrases, words of six or more
    letters, numbers) are pulled out of ``content`` with regular expressions
    and slotted into fixed templates selected by ``difficulty``.

    Args:
        content: Raw study text.
        difficulty: "easy" or "medium"; any other value uses the hard templates.
        count: Number of questions wanted.
        page_ref: Optional page number copied onto every question.

    Returns:
        At most ``count`` question dicts with the keys ``id``,
        ``question_text``, ``question_type``, ``options`` (JSON string or
        ``None``), ``correct_answer``, ``difficulty``, ``explanation`` and
        ``page_reference``.
    """

    def build(index, text, kind, answer, level, explanation, options=None):
        # Single place that fixes the shape of a question record.
        return {
            "id": generate_id(f"q_{index}"),
            "question_text": text,
            "question_type": kind,
            "options": options,
            "correct_answer": answer,
            "difficulty": level,
            "explanation": explanation,
            "page_reference": page_ref,
        }

    # Clean and prepare text
    content = re.sub(r'\s+', ' ', content).strip()

    # Meaningful sentences: longer than 40 chars and not just a number
    sentences = [
        piece.strip()
        for piece in re.split(r'[.!?]+', content)
        if len(piece.strip()) > 40 and not piece.strip().isdigit()
    ]
    if not sentences:
        sentences = [content[:200]]

    # Key terms: capitalized phrases, long words (minus filler), numbers
    filler_words = {'however', 'therefore', 'although', 'especially', 'important', 'different', 'significant'}
    capitalized = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', content)
    long_words = [w for w in re.findall(r'\b[a-zA-Z]{6,}\b', content) if w.lower() not in filler_words]
    numbers = re.findall(r'\b\d+(?:\.\d+)?%?\b|\b\d+(?:,\d+)*(?:th|st|nd|rd)?\b', content)

    # dict.fromkeys de-duplicates while keeping first-seen order, so the same
    # input always yields the same questions (a set would reorder per run).
    key_terms = list(dict.fromkeys(capitalized[:10] + long_words[:10] + numbers[:5]))

    questions = []

    if difficulty == "easy":
        # Easy: term definitions first, then true/false statements
        for i in range(min(count, len(sentences) + len(key_terms))):
            if i < len(key_terms):
                term = key_terms[i]
                questions.append(build(
                    i,
                    f"Define or explain the term \"{term}\" in your own words.",
                    "short_answer",
                    f"\"{term}\" is an important concept discussed in the material. A good answer should explain its meaning and significance.",
                    "easy",
                    f"Look for where \"{term}\" is introduced and how it's used in context.",
                ))
            else:
                sentence = sentences[i - len(key_terms)]
                questions.append(build(
                    i,
                    f"True or False: {sentence[:150]}...",
                    "true_false",
                    "True",
                    "easy",
                    "This statement appears in the study material and is presented as fact.",
                ))

    elif difficulty == "medium":
        # Medium: multiple choice about the main idea of each sentence
        for i in range(min(count, len(sentences))):
            sentence = sentences[i]
            concept = key_terms[i % len(key_terms)] if key_terms else "the concept"
            options = [
                f"The material emphasizes {concept} as a key factor",
                "A minor detail mentioned briefly",
                "An unrelated example for context",
                "The conclusion drawn from the discussion",
            ]
            questions.append(build(
                i,
                f"Based on the text: \"{sentence[:200]}...\" Which of the following best describes the main idea?",
                "multiple_choice",
                options[0],
                "medium",
                f"The text focuses on {concept} as the central theme of this passage.",
                options=json.dumps(options),
            ))

    else:  # hard
        # Hard: analysis, application, evaluation prompts in rotation
        for i in range(min(count, len(sentences))):
            sentence = sentences[i]
            concept = key_terms[i % len(key_terms)] if key_terms else "this concept"
            prompts = [
                f"Analyze the following statement and explain its implications: \"{sentence[:200]}...\"",
                f"How would you apply the concept of {concept} to a real-world situation?",
                f"Evaluate the following claim based on the material: \"{sentence[:150]}...\" Do you agree? Why or why not?",
                f"What are the strengths and weaknesses of the argument presented in: \"{sentence[:150]}...\"",
            ]
            questions.append(build(
                i,
                prompts[i % len(prompts)],
                "short_answer",
                f"This question requires critical thinking. A good answer would demonstrate understanding of {concept} and its broader implications as discussed in the material.",
                "hard",
                "Consider multiple perspectives, evidence from the text, and potential applications.",
            ))

    # Pad up to 'count' by cycling through the generated questions. Each padded
    # entry is derived from the entry one full round earlier, so every round
    # adds one more suffix and no two questions share the same text.
    base_count = len(questions)
    while base_count and len(questions) < count:
        template = dict(questions[len(questions) - base_count])
        template["id"] = generate_id(f"q_{len(questions)}")
        template["question_text"] = template["question_text"] + " (Additional perspective)"
        questions.append(template)

    print(f"✅ Generated {len(questions)} fallback questions")
    return questions[:count]
511
+
512
def generate_flashcards(content: str, concepts: List[str], count: int = 8) -> List[Dict]:
    """
    Turn the leading concepts into flashcard dicts.

    The back of each card quotes (up to 150 chars of) the first long sentence
    that mentions the concept; when none does, the sentence at the same
    position as the concept is used, if there is one.
    """
    long_sentences = []
    for fragment in re.split(r'[.!?]+', content):
        fragment = fragment.strip()
        if len(fragment) > 50:
            long_sentences.append(fragment)

    # Clamp at zero so a negative count behaves like "no cards"
    limit = max(min(count, len(concepts)), 0)

    cards = []
    for position, concept in enumerate(concepts[:limit]):
        needle = concept.lower()
        snippet = next(
            (candidate[:150] for candidate in long_sentences if needle in candidate.lower()),
            "",
        )
        if not snippet and position < len(long_sentences):
            snippet = long_sentences[position][:150]

        card = {
            "id": generate_id(f"fc_{position}"),
            "front": f"Explain the concept of \"{concept}\" and its significance.",
            "back": f"{snippet}... This concept is important because it helps understand the overall topic. Review the material for specific details about {concept}.",
            "category": "Key Concept",
            "difficulty": "medium",
        }
        cards.append(card)

    return cards
540
+
541
def extract_key_concepts(content: str, max_count: int = 15) -> List[str]:
    """
    Rank candidate concepts in ``content`` by how often they occur.

    Single words (four or more letters, stop words removed) are ranked first;
    up to five two-word phrases free of stop words follow. The combined list
    is cut to ``max_count`` entries.
    """
    stop_words = {
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were',
        'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'so', 'if', 'then',
        'else', 'when', 'where', 'which', 'what', 'who', 'whom', 'this', 'that', 'these', 'those', 'it', 'they', 'we',
        'you', 'he', 'she', 'them', 'her', 'him', 'us', 'can', 'will', 'would', 'could', 'should', 'may', 'might',
        'must', 'from', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'between', 'using',
        'however', 'therefore', 'although', 'especially', 'important', 'different', 'significant',
    }

    lowered = content.lower()

    # Word frequencies, ignoring stop words
    word_counts = {}
    for token in re.findall(r'\b[a-z]{4,}\b', lowered):
        if token in stop_words:
            continue
        word_counts[token] = word_counts.get(token, 0) + 1

    # Two-word phrase frequencies over the first 100 matches
    phrase_counts = {}
    for pair in re.findall(r'\b[a-z]{3,}\s+[a-z]{3,}\b', lowered)[:100]:
        if stop_words.isdisjoint(pair.split()):
            phrase_counts[pair] = phrase_counts.get(pair, 0) + 1

    # sorted() is stable, so equally frequent entries keep first-seen order
    ranked_words = sorted(word_counts, key=word_counts.get, reverse=True)
    ranked_phrases = sorted(phrase_counts, key=phrase_counts.get, reverse=True)

    concepts = list(ranked_words[:max_count])
    for pair in ranked_phrases[:5]:
        if pair not in concepts:
            concepts.append(pair)

    return concepts[:max_count]
582
+
583
+ # ==================== API ENDPOINTS ====================
584
+
585
@app.get("/")
async def serve_frontend():
    """Return index.html from the working directory, or a placeholder page if it is missing."""
    placeholder = """
        <!DOCTYPE html>
        <html>
        <head><title>StudyFlow AI</title></head>
        <body>
        <h1>StudyFlow AI Backend Running</h1>
        <p>API is operational. Please ensure index.html is in the same directory.</p>
        <p>Available endpoints: /api/user/sessions, /api/session/{id}, /api/process-content</p>
        </body>
        </html>
        """
    try:
        with open("index.html", "r", encoding="utf-8") as page:
            markup = page.read()
    except FileNotFoundError:
        # Frontend not deployed next to the backend: show the status page instead
        markup = placeholder
    return HTMLResponse(content=markup)
603
+
604
@app.get("/health")
async def health_check():
    """Report liveness, the current timestamp, the database path and whether an HF token is set."""
    report = {"status": "healthy"}
    report["timestamp"] = datetime.now().isoformat()
    report["database"] = DB_PATH
    # A non-empty token is the only signal that AI generation can be attempted
    report["ai_available"] = bool(HF_API_TOKEN)
    return report
613
 
614
  @app.post("/api/process-content")
615
  async def process_content(
 
619
  content: str = Form(None),
620
  file: UploadFile = File(None),
621
  youtube_url: str = Form(None),
622
+ selected_pages: str = Form(None),
623
+ time_start: float = Form(None),
624
+ time_end: float = Form(None),
625
+ num_questions: int = Form(15)
626
  ):
627
+ """
628
+ Process uploaded content and generate questions
629
+ Supports: text, PDF with page selection, YouTube with time selection
630
+ """
631
+
632
+ print(f"πŸ“ Processing request: type={content_type}, difficulty={difficulty}, title={title}, num_questions={num_questions}")
633
 
634
+ session_id = generate_id("session")
635
  text_content = ""
636
  pages_dict = {}
637
  total_pages = 0
638
  selected_pages_list = []
639
 
640
  try:
641
+ # Handle different content types
642
  if content_type == "text":
643
+ if not content:
644
+ raise HTTPException(status_code=400, detail="No text content provided")
645
+ text_content = content[:50000] # Limit to 50k chars
646
+ print(f"πŸ“„ Text content length: {len(text_content)} chars")
647
 
648
+ elif content_type == "pdf":
649
+ if not file:
650
+ raise HTTPException(status_code=400, detail="No PDF file provided")
651
+
652
+ # Save uploaded file temporarily
653
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
654
  content_bytes = await file.read()
655
  temp_file.write(content_bytes)
656
  temp_file_path = temp_file.name
657
 
658
+ # Extract pages from PDF
659
  pages_dict = extract_text_from_pdf(temp_file_path)
660
  os.unlink(temp_file_path)
661
+
662
  total_pages = len(pages_dict)
663
 
664
  # Parse selected pages
665
  if selected_pages:
666
+ try:
667
+ selected_pages_list = json.loads(selected_pages)
668
+ except:
669
+ selected_pages_list = []
670
+
671
+ # If no pages selected, select all pages with content
672
+ if not selected_pages_list:
673
  selected_pages_list = list(pages_dict.keys())
674
 
675
+ # Combine text from selected pages
676
+ for page_num in sorted(selected_pages_list):
677
  if page_num in pages_dict:
678
  text_content += f"\n--- Page {page_num} ---\n{pages_dict[page_num]}\n"
679
 
680
+ print(f"πŸ“„ PDF: {total_pages} total pages, selected {len(selected_pages_list)} pages, {len(text_content)} chars")
681
+
682
+ elif content_type == "youtube":
683
+ if not youtube_url:
684
+ raise HTTPException(status_code=400, detail="No YouTube URL provided")
685
+
686
+ text_content = extract_text_from_youtube(youtube_url, time_start, time_end)
687
+ if not text_content:
688
+ text_content = f"YouTube video content from: {youtube_url}\n\nNote: Transcript extraction may not be available for all videos."
689
+
690
+ print(f"πŸ“„ YouTube content length: {len(text_content)} chars")
691
+
692
+ else:
693
+ raise HTTPException(status_code=400, detail=f"Invalid content type: {content_type}")
694
 
695
+ # Validate content
696
  if len(text_content) < 100:
697
+ raise HTTPException(status_code=400, detail=f"Content too short ({len(text_content)} chars). Minimum 100 characters required for quality questions.")
698
 
699
  # Generate questions
700
+ questions = generate_questions_with_ai(text_content, difficulty, num_questions)
701
+
702
+ # Extract key concepts for flashcards
703
+ concepts = extract_key_concepts(text_content, 12)
704
+ flashcards = generate_flashcards(text_content, concepts, min(8, num_questions // 2))
705
 
706
  # Save to database
707
  conn = sqlite3.connect(DB_PATH)
708
  cursor = conn.cursor()
709
 
710
  # Save session
711
+ cursor.execute("""
712
+ INSERT INTO sessions (id, title, content_type, difficulty, selected_pages, total_pages, last_accessed)
713
+ VALUES (?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
714
+ """, (
715
+ session_id, title, content_type, difficulty,
716
+ json.dumps(selected_pages_list) if selected_pages_list else None,
717
+ total_pages
718
+ ))
719
 
720
  # Save pages
721
  for page_num, page_content in pages_dict.items():
722
+ cursor.execute("""
723
+ INSERT INTO pages (id, session_id, page_number, content)
724
+ VALUES (?, ?, ?, ?)
725
+ """, (generate_id("page"), session_id, page_num, page_content[:10000]))
726
 
727
  # Save questions
728
  for q in questions:
729
+ cursor.execute("""
730
+ INSERT INTO questions (id, session_id, question_text, question_type, options, correct_answer, difficulty, explanation, page_reference)
731
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
732
+ """, (
733
+ q["id"], session_id, q["question_text"], q["question_type"],
734
+ q.get("options"), q["correct_answer"], q["difficulty"],
735
+ q.get("explanation", ""), q.get("page_reference")
736
+ ))
737
+
738
+ # Save flashcards
739
+ for fc in flashcards:
740
+ cursor.execute("""
741
+ INSERT INTO flashcards (id, session_id, front, back, category, difficulty)
742
+ VALUES (?, ?, ?, ?, ?, ?)
743
+ """, (fc["id"], session_id, fc["front"], fc["back"], fc["category"], fc.get("difficulty", "medium")))
744
+
745
+ # Update user profile
746
+ cursor.execute("INSERT OR IGNORE INTO user_profile (id) VALUES (1)")
747
+ cursor.execute("UPDATE user_profile SET total_sessions_created = total_sessions_created + 1, last_active = CURRENT_TIMESTAMP WHERE id = 1")
748
 
749
  conn.commit()
750
  conn.close()
751
 
752
+ print(f"βœ… Session created: {session_id} with {len(questions)} questions, {len(flashcards)} flashcards")
753
+
754
  return {
755
+ "success": True,
756
  "session_id": session_id,
757
  "question_count": len(questions),
758
+ "flashcard_count": len(flashcards),
759
  "total_pages": total_pages,
760
  "selected_pages": selected_pages_list
761
  }
 
763
  except HTTPException:
764
  raise
765
  except Exception as e:
766
+ print(f"❌ Error processing content: {str(e)}")
767
+ import traceback
768
+ traceback.print_exc()
769
  raise HTTPException(status_code=500, detail=str(e))
770
 
771
  @app.get("/api/session/{session_id}")
772
  async def get_session(session_id: str):
773
+ """Get complete session data including questions, flashcards, and pages"""
774
+
775
  conn = sqlite3.connect(DB_PATH)
776
  conn.row_factory = sqlite3.Row
777
  cursor = conn.cursor()
778
 
779
+ # Get session info
780
  cursor.execute("SELECT * FROM sessions WHERE id = ?", (session_id,))
781
  session = cursor.fetchone()
782
 
 
784
  conn.close()
785
  raise HTTPException(status_code=404, detail="Session not found")
786
 
787
+ # Update last accessed
788
+ cursor.execute("UPDATE sessions SET last_accessed = CURRENT_TIMESTAMP WHERE id = ?", (session_id,))
789
+
790
+ # Get questions
791
+ cursor.execute("SELECT * FROM questions WHERE session_id = ? ORDER BY created_at", (session_id,))
792
  questions = [dict(row) for row in cursor.fetchall()]
793
 
794
+ # Parse options JSON for multiple choice questions
795
+ for q in questions:
796
+ if q.get("options"):
797
+ try:
798
+ q["options"] = json.loads(q["options"])
799
+ except:
800
+ q["options"] = []
801
+
802
+ # Get flashcards
803
+ cursor.execute("SELECT * FROM flashcards WHERE session_id = ?", (session_id,))
804
+ flashcards = [dict(row) for row in cursor.fetchall()]
805
+
806
+ # Get pages
807
  cursor.execute("SELECT * FROM pages WHERE session_id = ? ORDER BY page_number", (session_id,))
808
  pages = [dict(row) for row in cursor.fetchall()]
809
 
810
+ # Calculate performance metrics
811
  total_questions = len(questions)
812
  correct_answers = sum(1 for q in questions if q.get("is_correct") == 1)
813
  accuracy = round((correct_answers / total_questions * 100) if total_questions > 0 else 0, 1)
814
 
815
+ conn.commit()
816
  conn.close()
817
 
818
  return {
819
  "session": dict(session),
 
820
  "questions": questions,
821
+ "flashcards": flashcards,
822
+ "pages": pages,
823
  "performance": {
824
  "total_questions": total_questions,
825
  "correct_answers": correct_answers,
826
+ "accuracy": accuracy,
827
+ "completion_rate": round((len([q for q in questions if q.get("user_answer")]) / total_questions * 100) if total_questions > 0 else 0, 1)
828
  }
829
  }
830
 
831
@app.get("/api/user/sessions")
async def get_user_sessions():
    """
    List every session, most recently accessed first, with basic stats.

    Returns:
        ``{"sessions": [...]}`` where each session row is extended with
        ``question_count`` and ``accuracy`` (percentage of questions whose
        ``is_correct`` is set, rounded to one decimal).
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()

        cursor.execute("SELECT * FROM sessions ORDER BY last_accessed DESC")
        sessions = [dict(row) for row in cursor.fetchall()]

        # One grouped query instead of one query per session
        cursor.execute(
            "SELECT session_id, COUNT(*), SUM(is_correct) FROM questions GROUP BY session_id"
        )
        # SUM() is NULL when no question has been answered yet
        stats = {row[0]: (row[1] or 0, row[2] or 0) for row in cursor.fetchall()}
    finally:
        # Close even when a query raises so the connection is never leaked
        conn.close()

    for session in sessions:
        total, correct = stats.get(session["id"], (0, 0))
        session["question_count"] = total
        session["accuracy"] = round((correct / total * 100) if total > 0 else 0, 1)

    return {"sessions": sessions}
856
 
857
  @app.post("/api/submit-answer")
 
861
  user_answer: str = Form(...),
862
  time_spent: int = Form(0)
863
  ):
864
+ """Submit and evaluate an answer"""
865
+
866
  conn = sqlite3.connect(DB_PATH)
867
  cursor = conn.cursor()
868
 
869
+ # Get question details
870
  cursor.execute("SELECT correct_answer, question_type FROM questions WHERE id = ? AND session_id = ?",
871
  (question_id, session_id))
872
  result = cursor.fetchone()
 
878
  correct_answer = result[0]
879
  question_type = result[1]
880
 
881
+ # Evaluate based on question type
882
  is_correct = 0
883
+
884
  if question_type == "multiple_choice":
885
+ # Exact match for multiple choice
886
  is_correct = 1 if user_answer.strip() == correct_answer.strip() else 0
887
+
888
  elif question_type == "true_false":
889
+ # Case-insensitive match for true/false
890
  is_correct = 1 if user_answer.strip().lower() == correct_answer.strip().lower() else 0
891
+
892
+ elif question_type == "fill_blank":
893
+ # Flexible matching for fill in blank
894
+ user_clean = user_answer.strip().lower()
895
+ correct_clean = correct_answer.strip().lower()
896
+ is_correct = 1 if (user_clean == correct_clean or correct_clean in user_clean or user_clean in correct_clean) else 0
897
+
898
+ else: # short_answer
899
  # Smart evaluation for short answers
900
+ user_clean = user_answer.strip().lower()
901
+ correct_clean = correct_answer.strip().lower()
902
+
903
+ # Extract key words from correct answer
904
+ key_words = re.findall(r'\b[a-z]{4,}\b', correct_clean)
905
+ key_words = [w for w in key_words if w not in ['this', 'that', 'these', 'those', 'there', 'their', 'would', 'could', 'should']]
906
+
907
+ if key_words:
908
+ # Count how many key words appear in user answer
909
+ matches = sum(1 for kw in key_words if kw in user_clean)
910
+ is_correct = 1 if matches >= len(key_words) * 0.4 else 0
911
+ else:
912
+ # Fallback: check length and similarity
913
+ is_correct = 1 if len(user_clean) > 30 or user_clean in correct_clean or correct_clean in user_clean else 0
914
 
915
+ # Update database
916
+ cursor.execute("""
917
+ UPDATE questions
918
+ SET user_answer = ?, is_correct = ?, time_spent = ?
919
+ WHERE id = ? AND session_id = ?
920
+ """, (user_answer, is_correct, time_spent, question_id, session_id))
921
+
922
+ # Update user profile
923
+ cursor.execute("UPDATE user_profile SET total_questions_answered = total_questions_answered + 1, total_correct_answers = total_correct_answers + ? WHERE id = 1", (is_correct,))
924
 
925
  conn.commit()
926
  conn.close()
 
928
  return {
929
  "is_correct": bool(is_correct),
930
  "correct_answer": correct_answer,
931
+ "feedback": "Correct! Great job!" if is_correct else f"The correct answer is: {correct_answer[:200]}"
932
  }
933
 
934
@app.delete("/api/session/{session_id}")
async def delete_session(session_id: str):
    """
    Delete a session together with its questions, flashcards, pages and notes.

    Raises:
        HTTPException: 404 when no session has the given id.

    Returns:
        A confirmation message and ``affected``, the number of session rows
        removed.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()

        cursor.execute("SELECT id FROM sessions WHERE id = ?", (session_id,))
        if cursor.fetchone() is None:
            raise HTTPException(status_code=404, detail="Session not found")

        # SQLite only enforces foreign keys (and therefore ON DELETE CASCADE)
        # when PRAGMA foreign_keys is enabled on the connection, which this
        # code does not do. Remove dependent rows explicitly so nothing is
        # orphaned either way.
        cursor.execute("DELETE FROM questions WHERE session_id = ?", (session_id,))
        cursor.execute("DELETE FROM flashcards WHERE session_id = ?", (session_id,))
        cursor.execute("DELETE FROM pages WHERE session_id = ?", (session_id,))
        cursor.execute("DELETE FROM notes WHERE session_id = ?", (session_id,))

        cursor.execute("DELETE FROM sessions WHERE id = ?", (session_id,))
        affected = cursor.rowcount

        conn.commit()
    finally:
        # Also reached on the 404 path and on database errors
        conn.close()

    return {"message": "Session deleted successfully", "affected": affected}
955
+
956
@app.post("/api/save-note")
async def save_note(
    session_id: str = Form(...),
    title: str = Form(...),
    content: str = Form(...),
    note_id: str = Form(None)
):
    """
    Create a note for a session, or update an existing one.

    Args:
        session_id: Session the note belongs to.
        title: Note title.
        content: Note body.
        note_id: Id of the note to update; omit to create a new note.

    Raises:
        HTTPException: 404 when ``note_id`` is given but no note with that id
            exists in the session.

    Returns:
        ``{"success": True, "note_id": ...}`` with the id of the saved note.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()

        if note_id:
            # Update existing note
            cursor.execute("""
                UPDATE notes SET title = ?, content = ?, updated_at = CURRENT_TIMESTAMP
                WHERE id = ? AND session_id = ?
            """, (title, content, note_id, session_id))
            if cursor.rowcount == 0:
                # Nothing matched: report it instead of claiming success
                raise HTTPException(status_code=404, detail="Note not found")
        else:
            # Create new note
            note_id = generate_id("note")
            cursor.execute("""
                INSERT INTO notes (id, session_id, title, content)
                VALUES (?, ?, ?, ?)
            """, (note_id, session_id, title, content))

        conn.commit()
    finally:
        # Close even when a statement raises so the connection is never leaked
        conn.close()

    return {"success": True, "note_id": note_id}
986
+
987
@app.get("/api/user/profile")
async def get_user_profile():
    """
    Return the single user profile row (id 1) with derived statistics.

    Returns:
        A dict with ``profile`` (the stored counters, or zeros when no row
        exists), ``accuracy`` (percent, one decimal), ``streak`` (always 0
        for now) and ``total_study_minutes``.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()
        cursor.execute("SELECT * FROM user_profile WHERE id = 1")
        row = cursor.fetchone()
    finally:
        # Close even when the query raises so the connection is never leaked
        conn.close()

    if row is None:
        profile = {
            "total_questions_answered": 0,
            "total_correct_answers": 0,
            "total_study_time": 0,
            "total_sessions_created": 0
        }
    else:
        profile = dict(row)

    # "or 0" guards against NULL counters coming back as None
    total = profile.get("total_questions_answered") or 0
    correct = profile.get("total_correct_answers") or 0
    study_seconds = profile.get("total_study_time") or 0
    accuracy = round((correct / total * 100) if total > 0 else 0, 1)

    return {
        "profile": profile,
        "accuracy": accuracy,
        "streak": 0,  # Would need additional logic for streak
        "total_study_minutes": study_seconds // 60
    }
1021
+
1022
@app.post("/api/update-study-time")
async def update_study_time(
    session_id: str = Form(...),
    time_spent: int = Form(0)
):
    """
    Add ``time_spent`` to the user's total study time and touch the session.

    Args:
        session_id: Session whose ``last_accessed`` timestamp is refreshed.
        time_spent: Amount added to ``user_profile.total_study_time``.

    Returns:
        ``{"success": True}``.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()

        # Make sure the profile row exists; otherwise the UPDATE below would
        # silently match nothing and the study time would be lost.
        cursor.execute("INSERT OR IGNORE INTO user_profile (id) VALUES (1)")
        cursor.execute("UPDATE user_profile SET total_study_time = total_study_time + ? WHERE id = 1", (time_spent,))
        cursor.execute("UPDATE sessions SET last_accessed = CURRENT_TIMESTAMP WHERE id = ?", (session_id,))

        conn.commit()
    finally:
        # Close even when a statement raises so the connection is never leaked
        conn.close()

    return {"success": True}
1039
+
1040
+ # ==================== MAIN ENTRY POINT ====================
1041
 
1042
if __name__ == "__main__":
    # Script entry point: print a startup banner, then serve the app.
    import uvicorn

    ai_enabled = bool(HF_API_TOKEN)

    print("=" * 60)
    print("🚀 StudyFlow AI Backend Server")
    print("=" * 60)
    print(f"📁 Database: {DB_PATH}")
    print(f"🤖 AI Available: {ai_enabled}")
    if ai_enabled:
        # Never echo any part of the token: startup logs are often public
        print("🔑 HF API Token: configured")
    else:
        print("⚠️ No HF API Token - using fallback question generation")
        print("   Get a free token at: https://huggingface.co/settings/tokens")
    print("=" * 60)
    print("🌐 Server starting at: http://0.0.0.0:7860")
    print("📖 API Docs: http://0.0.0.0:7860/docs")
    print("=" * 60)

    uvicorn.run(app, host="0.0.0.0", port=7860)