Prof-Reza committed on
Commit
06825b1
·
verified ·
1 Parent(s): dd22541

Use JSON schema for course outline; implement structured plan generation; add schema file; update planner to produce JSON; update app to write JSON and doc attachments; update requirements and searcher for PDF extraction and unify dependencies.

Browse files

This update introduces a JSON schema for course outlines and modifies the planner and app to generate structured course plans that follow this schema. A new file 'course_outline_schema.json' defines the outline structure. The planner now reads this schema and instructs the LLM to output valid JSON matching it. The `finalize_and_doc` function in app.py writes the JSON outline to a file and records both it and the generated Word document as chat attachments. Searcher gains PDF extraction support and requirements.txt is updated to include PyPDF2 and unify dependencies.

Files changed (4) hide show
  1. app.py +25 -8
  2. course_outline_schema.json +46 -0
  3. planner.py +65 -19
  4. searcher.py +38 -0
app.py CHANGED
@@ -485,15 +485,29 @@ def finalize_and_doc(chat_history, chat_pairs, sources, plan, chat_key):
485
  chat_history = []
486
  if sources is None:
487
  sources = []
488
- # Generate the course plan text
 
489
  try:
490
- plan_text = plan_course(chat_history, sources)
491
  except Exception as e:
492
- plan_text = (
493
- "An error occurred while generating the course outline. Please ensure your API keys are configured.\n"
494
- f"(Error: {e})"
495
  )
496
- # Create a Word document from the plan text and sources
 
 
 
 
 
 
 
 
 
 
 
 
 
497
  try:
498
  doc_path = outline_to_docx("Course Outline", plan_text, references=sources)
499
  except Exception as e:
@@ -506,10 +520,13 @@ def finalize_and_doc(chat_history, chat_pairs, sources, plan, chat_key):
506
  with open(tmp_path, "w") as f:
507
  f.write(err_msg)
508
  doc_path = tmp_path
509
- # Record the generated document as an attachment tied to this chat
510
  if chat_key:
511
  try:
512
- add_attachment(chat_key, doc_path, os.path.basename(doc_path))
 
 
 
513
  except Exception:
514
  pass
515
  # Fetch updated attachment list
 
485
  chat_history = []
486
  if sources is None:
487
  sources = []
488
+ import json
489
+ # Generate the course plan as structured JSON using the planner
490
  try:
491
+ json_string = plan_course(chat_history, sources)
492
  except Exception as e:
493
+ json_string = (
494
+ "{\n \"error\": \"An error occurred while generating the course outline.\",\n"
495
+ f" \"details\": \"{str(e).replace('"', '\\"')}\"\n}}"
496
  )
497
+ # Attempt to parse the JSON to check that it is valid; if parsing fails, the raw string is still used unchanged
498
+ try:
499
+ parsed = json.loads(json_string)
500
+ except Exception:
501
+ parsed = None
502
+ plan_text = json_string
503
+ # Write the JSON outline to a file for download
504
+ json_path = "/tmp/course_outline.json"
505
+ try:
506
+ with open(json_path, "w") as jf:
507
+ jf.write(json_string)
508
+ except Exception:
509
+ json_path = None
510
+ # Create a Word document from the JSON string; we simply embed the JSON as text into the document
511
  try:
512
  doc_path = outline_to_docx("Course Outline", plan_text, references=sources)
513
  except Exception as e:
 
520
  with open(tmp_path, "w") as f:
521
  f.write(err_msg)
522
  doc_path = tmp_path
523
+ # Record the generated JSON and document as attachments
524
  if chat_key:
525
  try:
526
+ if json_path:
527
+ add_attachment(chat_key, json_path, os.path.basename(json_path))
528
+ if doc_path:
529
+ add_attachment(chat_key, doc_path, os.path.basename(doc_path))
530
  except Exception:
531
  pass
532
  # Fetch updated attachment list
course_outline_schema.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ {
3
+ "title": "",
4
+ "tagline": "",
5
+ "description": "",
6
+ "duration": "",
7
+ "level": "",
8
+ "audience": "",
9
+ "prerequisites": "",
10
+ "main_outcome": "",
11
+ "learning_objectives": [],
12
+ "key_takeaways": [],
13
+ "skills": [],
14
+ "seo_keywords": [],
15
+ "real_world_connections": "",
16
+ "proof_of_learning": "",
17
+ "tools": [
18
+ {
19
+ "name": "",
20
+ "description": "",
21
+ "url": ""
22
+ }
23
+ ],
24
+ "course_plan": [
25
+ {
26
+ "module_title": "",
27
+ "lessons": [
28
+ {
29
+ "lesson_title": "",
30
+ "items": [
31
+ {
32
+ "type": "",
33
+ "title": "",
34
+ "description": ""
35
+ }
36
+ ]
37
+ }
38
+ ]
39
+ }
40
+ ],
41
+ "capstone_project": "",
42
+ "readings": [],
43
+ "recommended_next_steps": [],
44
+ "references": [],
45
+ "attachments": []
46
+ }
planner.py CHANGED
@@ -3,64 +3,110 @@ import openai
3
 
4
 
5
  def plan_course(messages, sources):
6
- """Use OpenAI to plan a course based on messages and sources. Tries to handle different OpenAI SDK versions."""
7
- # Ensure API key is available
8
- # Support alternative secret name COURSECREATOR_API_KEY as a fallback for the OpenAI API key
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  api_key = os.getenv("OPENAI_API_KEY") or os.getenv("COURSECREATOR_API_KEY")
10
  if not api_key:
11
  raise ValueError(
12
  "An OpenAI API key is required to plan the course (set OPENAI_API_KEY or COURSECREATOR_API_KEY)"
13
  )
 
 
 
 
 
 
 
 
 
 
 
 
14
  system_prompt = (
15
- "You are an expert course planner. Use the conversation and sources to propose a structured plan."
 
 
 
 
16
  )
17
- formatted_messages = [{"role": "system", "content": system_prompt}]
 
 
 
 
18
  for msg in messages:
19
  formatted_messages.append(msg)
20
- # Use a widely supported default model; older OpenAI SDKs may not recognise newer names
 
 
 
 
 
 
 
 
 
 
 
21
  model = os.getenv("OPENAI_MODEL", "gpt-3.5-turbo")
22
- temperature = float(os.getenv("TEMPERATURE", "0.7"))
23
- max_tokens = int(os.getenv("MAX_OUTPUT_TOKENS", "2048"))
24
- # Try to call OpenAI using v1-style client if available
25
  try:
26
- # Newer OpenAI Python SDK (>=1.0) exposes `OpenAI` client
27
  if hasattr(openai, "OpenAI"):
28
  client = openai.OpenAI(api_key=api_key)
29
- # Try with max_tokens; fall back to max_completion_tokens if unsupported
30
  try:
31
- response = client.chat.completions.create(
32
  model=model,
33
  messages=formatted_messages,
34
  temperature=temperature,
35
  max_tokens=max_tokens,
36
  )
37
  except Exception:
38
- response = client.chat.completions.create(
 
39
  model=model,
40
  messages=formatted_messages,
41
  temperature=temperature,
42
  max_completion_tokens=max_tokens,
43
  )
44
- plan_text = response.choices[0].message.content
45
  else:
46
  # Legacy OpenAI SDK (<1.0)
47
  openai.api_key = api_key
48
  try:
49
- response = openai.ChatCompletion.create(
50
  model=model,
51
  messages=formatted_messages,
52
  temperature=temperature,
53
  max_tokens=max_tokens,
54
  )
55
  except Exception:
56
- response = openai.ChatCompletion.create(
57
  model=model,
58
  messages=formatted_messages,
59
  temperature=temperature,
60
  max_completion_tokens=max_tokens,
61
  )
62
- plan_text = response["choices"][0]["message"]["content"]
63
  except Exception as e:
64
- # Propagate error for caller to handle
65
  raise RuntimeError(f"OpenAI API error: {e}")
66
- return plan_text
 
 
3
 
4
 
5
  def plan_course(messages, sources):
6
+ """
7
+ Generate a structured course outline as a JSON object using the conversation and collected sources.
8
+
9
+ This function reads a JSON schema from the repository (``course_outline_schema.json``) and instructs
10
+ the language model to produce an output that strictly follows the schema. The conversation history
11
+ (``messages``) and list of resources (``sources``) are provided to the model as context.
12
+
13
+ Args:
14
+ messages (list[dict]): Conversation history with roles and content.
15
+ sources (list[dict]): List of source dictionaries with "title" and "url" keys.
16
+
17
+ Returns:
18
+ str: A JSON string representing the course outline that matches the schema.
19
+
20
+ Raises:
21
+ RuntimeError: If the OpenAI API call fails.
22
+ ValueError: If an API key is not provided via environment variables.
23
+ """
24
+ # Ensure API key is available (support COURSECREATOR_API_KEY as fallback)
25
  api_key = os.getenv("OPENAI_API_KEY") or os.getenv("COURSECREATOR_API_KEY")
26
  if not api_key:
27
  raise ValueError(
28
  "An OpenAI API key is required to plan the course (set OPENAI_API_KEY or COURSECREATOR_API_KEY)"
29
  )
30
+ # Load the JSON schema from the local file to guide the model
31
+ schema_path = os.path.join(os.path.dirname(__file__) or ".", "course_outline_schema.json")
32
+ try:
33
+ with open(schema_path, "r") as f:
34
+ schema_content = f.read().strip()
35
+ except Exception:
36
+ # If the schema is not found, define a minimal fallback structure
37
+ schema_content = (
38
+ '{"title":"","description":"","course_plan":[]}'
39
+ )
40
+ # Compose system prompt: instruct the model to output JSON matching the schema and to use
41
+ # information from the conversation and the provided sources.
42
  system_prompt = (
43
+ "You are an expert course planner. Use the conversation and sources provided to produce a "
44
+ "detailed course outline. Your response MUST be a valid JSON object that strictly follows "
45
+ "this schema:\n\n"
46
+ f"{schema_content}\n\n"
47
+ "Do not wrap your answer in markdown or include any additional commentary. Only output the JSON."
48
  )
49
+ # Build messages array for the model: include system prompt, conversation, and a description of sources
50
+ formatted_messages = [
51
+ {"role": "system", "content": system_prompt},
52
+ ]
53
+ # Include the conversation history
54
  for msg in messages:
55
  formatted_messages.append(msg)
56
+ # Append sources description if present
57
+ if sources:
58
+ # Format sources as a numbered list for the model to reference
59
+ source_lines = []
60
+ for i, src in enumerate(sources, start=1):
61
+ if isinstance(src, dict):
62
+ t = src.get("title", "")
63
+ u = src.get("url", "")
64
+ source_lines.append(f"[{i}] {t} - {u}")
65
+ source_text = "\n".join(source_lines)
66
+ formatted_messages.append({"role": "system", "content": f"Sources:\n{source_text}"})
67
+ # Model configuration
68
  model = os.getenv("OPENAI_MODEL", "gpt-3.5-turbo")
69
+ temperature = float(os.getenv("TEMPERATURE", "0.3")) # Lower temperature for more deterministic JSON
70
+ max_tokens = int(os.getenv("MAX_OUTPUT_TOKENS", "4096"))
 
71
  try:
72
+ # Use new OpenAI client if available
73
  if hasattr(openai, "OpenAI"):
74
  client = openai.OpenAI(api_key=api_key)
 
75
  try:
76
+ resp = client.chat.completions.create(
77
  model=model,
78
  messages=formatted_messages,
79
  temperature=temperature,
80
  max_tokens=max_tokens,
81
  )
82
  except Exception:
83
+ # Fallback to max_completion_tokens if model requires it
84
+ resp = client.chat.completions.create(
85
  model=model,
86
  messages=formatted_messages,
87
  temperature=temperature,
88
  max_completion_tokens=max_tokens,
89
  )
90
+ content = resp.choices[0].message.content
91
  else:
92
  # Legacy OpenAI SDK (<1.0)
93
  openai.api_key = api_key
94
  try:
95
+ resp = openai.ChatCompletion.create(
96
  model=model,
97
  messages=formatted_messages,
98
  temperature=temperature,
99
  max_tokens=max_tokens,
100
  )
101
  except Exception:
102
+ resp = openai.ChatCompletion.create(
103
  model=model,
104
  messages=formatted_messages,
105
  temperature=temperature,
106
  max_completion_tokens=max_tokens,
107
  )
108
+ content = resp["choices"][0]["message"]["content"]
109
  except Exception as e:
 
110
  raise RuntimeError(f"OpenAI API error: {e}")
111
+ # The content should be valid JSON. Return as string so the caller can write to file or parse.
112
+ return content
searcher.py CHANGED
@@ -38,6 +38,14 @@ def run_web_search(query, num_results=5, domain_filter=""):
38
  import re
39
  from typing import List, Dict, Optional
40
 
 
 
 
 
 
 
 
 
41
  # Import DB helpers from sibling module. Note: db.py resides in the same package directory.
42
  from db import get_resource, upsert_resource
43
 
@@ -103,6 +111,36 @@ def fetch_and_extract(url: str, timeout: int = 15) -> Optional[Dict]:
103
  resp.raise_for_status()
104
  except Exception:
105
  return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  # Parse HTML
107
  soup = BeautifulSoup(resp.text, "html.parser")
108
  # Title: fall back to URL if missing
 
38
  import re
39
  from typing import List, Dict, Optional
40
 
41
+ # Additional imports for PDF extraction
42
+ import io
43
+ try:
44
+ from PyPDF2 import PdfReader # type: ignore
45
+ except ImportError:
46
+ # PyPDF2 will be installed via requirements; if missing, pdf extraction will be disabled
47
+ PdfReader = None
48
+
49
  # Import DB helpers from sibling module. Note: db.py resides in the same package directory.
50
  from db import get_resource, upsert_resource
51
 
 
111
  resp.raise_for_status()
112
  except Exception:
113
  return None
114
+ # If the response is a PDF (by content type or URL), attempt to extract text using PyPDF2
115
+ content_type = resp.headers.get("Content-Type", "").lower()
116
+ if (content_type.startswith("application/pdf") or url.lower().endswith(".pdf")) and PdfReader is not None:
117
+ try:
118
+ # Read PDF content
119
+ pdf_stream = io.BytesIO(resp.content)
120
+ reader = PdfReader(pdf_stream)
121
+ all_text = ""
122
+ for page in reader.pages:
123
+ try:
124
+ text = page.extract_text() or ""
125
+ except Exception:
126
+ text = ""
127
+ all_text += text + "\n"
128
+ if not all_text.strip():
129
+ return None
130
+ excerpt = all_text[:2000]
131
+ # Use the URL as the title for PDFs
132
+ title = url
133
+ # Determine domain
134
+ try:
135
+ from urllib.parse import urlparse
136
+ domain = urlparse(url).netloc
137
+ except Exception:
138
+ domain = ""
139
+ upsert_resource(url, title, domain, excerpt, meta={"length": len(all_text), "pdf": True})
140
+ return get_resource(url)
141
+ except Exception:
142
+ # If PDF extraction fails, continue with HTML extraction
143
+ pass
144
  # Parse HTML
145
  soup = BeautifulSoup(resp.text, "html.parser")
146
  # Title: fall back to URL if missing