# record-ocr / watch.py
# jehyn923's picture
# Update watch.py
# 9906f58 verified
import json
import os
from bs4 import BeautifulSoup
import re
from typing import List, Dict
from pydantic import BaseModel
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_core.prompts import ChatPromptTemplate
from langchain_upstage import UpstageDocumentParseLoader
# Module-level OpenAI chat clients, created at import time.
# os.environ[...] raises KeyError when OPENAI_API_KEY is not set, so this
# module cannot be imported without that variable.
openai_api_key = os.environ["OPENAI_API_KEY"]
# Both clients use temperature=0. Only gpt_4o is referenced by ocr_process in
# this file; gpt_4o_mini is defined but not used in the code visible here.
gpt_4o_mini = ChatOpenAI(model="gpt-4o-mini", temperature=0, openai_api_key=openai_api_key)
gpt_4o = ChatOpenAI(model="gpt-4o", temperature=0, openai_api_key=openai_api_key)
# Prompt template used by ocr_process to have the model map the HTML of the
# creative-experiential-activities chapter onto JSON.
# - The only template variable is {creative-experiential-activities-html}.
# - Literal braces of the JSON example are escaped as {{ and }} so that the
#   prompt-template formatter does not treat them as variables.
# - The example lists one entry per activityType (AUTONOMOUS, CLUB, CAREER)
#   for grades 1 to 3; the reply is validated against
#   CreativeExperientialActivityResult.
creative_experiential_activities_prompt = """
<instructions>
์•„๋ž˜ <creative-experiential-activities-html>์„ <creative-experiential-activities-json>ํ˜•์‹์œผ๋กœ ๋งคํ•‘ํ•ด์„œ ์‘๋‹ต์œผ๋กœ ์ „๋‹ฌํ•ด์ค˜.
์›๋ฌธ์˜ ๋ˆ„๋ฝ์ด ์žˆ์–ด์„  ์•ˆ๋˜๊ณ , ๊ฐ€๋” ์˜คํƒ€๊ฐ€ ์žˆ๊ฑฐ๋‚˜ ๋ฌธ๋งฅ์— ๋งž์ง€ ์•Š์€ ๋„์–ด์“ฐ๊ธฐ๊ฐ€ ์žˆ๋‹ค๋ฉด ๋„ˆ์˜ ํŒ๋‹จ์— ์ˆ˜์ •ํ•ด์„œ ์ „๋‹ฌํ•ด์ค˜.
๋‹ค์‹œํ•œ๋ฒˆ ๋งํ•˜์ง€๋งŒ ์ ˆ๋Œ€๋กœ ์‘๋‹ต์— ๊ฐ์ƒ‰์ด๋‚˜ ๋ˆ„๋ฝ์ด ์žˆ์œผ๋ฉด ์•ˆ๋ผ.
areaOfInterest๋Š” ํฌ๋ง๋ถ„์•ผ๋ฅผ ์˜๋ฏธํ•˜๋ฉฐ ์ง„๋กœํ™œ๋™์—๋งŒ ๋“ค์–ด๊ฐ€.
areaOfInterest๋Š” ๊ทธ๋ฆฌ๊ณ  ํฌ๋ง๋ถ„์•ผ๋ผ๋Š” ๋‹จ์–ด๋ฅผ ๋บ€ ๋‚ด์šฉ๋“ค๋งŒ ๋“ค์–ด๊ฐ€๋ฉด ๋ผ
areaOfInterest๊ฐ€ ์—†๋Š” ๊ฒฝ์šฐ์—” ๋นˆ ๋ฌธ์ž์—ด๋กœ ์ฒ˜๋ฆฌํ•ด์ค˜.
output์€ json๋งŒ ์ „๋‹ฌํ•ด์ค˜.
</instructions>
<creative-experiential-activities-html>
{creative-experiential-activities-html}
</creative-experiential-activities-html>
<creative-experiential-activities-json>
{{
//์ฐฝ์˜์  ์ฒดํ—˜ํ™œ๋™์‚ฌํ•ญ
"creativeExperientialActivities": [
{{
//์˜์—ญ
"activityType": "AUTONOMOUS", // AUTONOMOUS, CLUB, CAREER
//ํ•™๋…„
"grade": "1", (integer)
//์‹œ๊ฐ„
"hours": "", (integer)
//ํŠน๊ธฐ์‚ฌํ•ญ
"specialNotes": "", (string)
//ํฌ๋ง๋ถ„์•ผ
"areaOfInterest": "" (string)
}},
{{
"activityType": "CLUB", // AUTONOMOUS, CLUB, CAREER
"grade": "1", //ํ•™๋…„
"hours": "",
"specialNotes": "",
"areaOfInterest": ""
}},
{{
"activityType": "CAREER", // AUTONOMOUS, CLUB, CAREER
"grade": "1", //ํ•™๋…„
"hours": "",
"specialNotes": "",
"areaOfInterest": ""
}},
{{
"activityType": "AUTONOMOUS", // AUTONOMOUS, CLUB, CAREER
"grade": "2", //ํ•™๋…„
"hours": "",
"specialNotes": "",
"areaOfInterest": ""
}},
{{
"activityType": "CLUB", // AUTONOMOUS, CLUB, CAREER
"grade": "2", //ํ•™๋…„
"hours": "",
"specialNotes": "",
"areaOfInterest": ""
}},
{{
"activityType": "CAREER", // AUTONOMOUS, CLUB, CAREER
"grade": "2", //ํ•™๋…„
"hours": "",
"specialNotes": "",
"areaOfInterest": ""
}},
{{
"activityType": "AUTONOMOUS", // AUTONOMOUS, CLUB, CAREER
"grade": 3, //ํ•™๋…„
"hours": "",
"specialNotes": "",
"areaOfInterest": ""
}},
{{
"activityType": "CLUB", // AUTONOMOUS, CLUB, CAREER
"grade": "3", //ํ•™๋…„
"hours": "",
"specialNotes": "",
"areaOfInterest": ""
}},
{{
"activityType": "CAREER", // AUTONOMOUS, CLUB, CAREER
"grade": "3", //ํ•™๋…„
"hours": "",
"specialNotes": "",
"areaOfInterest": ""
}},
],
}}
</creative-experiential-activities-json>
"""
# Prompt template used by ocr_process to have the model map one section of the
# subject-achievement chapter onto JSON.
# - Template variables: {grade-course-type} and {academic-achievement-html}.
# - Literal braces of the JSON example are escaped as {{ and }}.
# - The <instructions> element is opened exactly once so that it pairs with
#   its single closing tag (the original opened it twice).
# - The reply is validated against AcademicAchievementResult.
academic_achievement_prompt = """
<instructions>
์•„๋ž˜ <academic-achievement-html>์„ <academic-achievement-json>ํ˜•์‹์œผ๋กœ ๋งคํ•‘ํ•ด์„œ ์‘๋‹ต์œผ๋กœ ์ „๋‹ฌํ•ด์ค˜.
ํ•™๋…„๊ณผ course type์€๋Š” <grade-course-type>์— ๋‚˜์™€์žˆ์„ ์˜ˆ์ •์ด์•ผ.
academic-achievement-html์˜ ์„ธ๋ถ€ ๋Šฅ๋ ฅ ๋ฐ ํŠน๊ธฐ์‚ฌํ•ญ์—” ๊ต๊ณผ, ๊ณผ๋ชฉ์— ๋Œ€ํ•œ ํ•™์ƒ์˜ ์ƒ์„ธ ์„ฑ์ทจ ๋ฐ ํŠน๊ธฐ์‚ฌํ•ญ์ด ๋“ค์–ด๊ฐ€๊ณ  ์žˆ์–ด.
์„ธ๋ถ€๋Šฅ๋ ฅ ๋ฐ ํŠน๊ธฐ์‚ฌํ•ญ ์•„๋ž˜์˜ ๋‚ด์šฉ์€ ์•„๋ž˜์ฒ˜๋Ÿผ ๊ตฌ์„ฑ๋˜์–ด ์žˆ์œผ๋‹ˆ๊นŒ ์ž˜ ์ฐธ๊ณ ํ•ด์„œ ๊ต๊ณผ๋ชฉ๋ณ„๋กœ ์ž˜ ํ• ๋‹นํ•ด์„œ ๋งคํ•‘ํ•ด์ค˜
์„ธ๋ถ€๋Šฅ๋ ฅ ๋ฐ ํŠน๊ธฐ์‚ฌํ•ญ์„ ์•„์ฃผ ๊ผผ๊ผผํžˆ ํ™•์ธํ•ด์„œ ์‘๋‹ตํ•ด์ค˜.
- ๊ต๊ณผ๋ชฉ : ํŠน๊ธฐ์‚ฌํ•ญ ๋ฐ ์„ฑ์ทจ๋„
- ๊ต๊ณผ๋ชฉ์— ํ•™๊ธฐ๊ฐ€ ๋‚˜๋ˆ ์ ธ์„œ ์ž‘์„ฑ๋˜์–ด ์žˆ๋Š” ๊ฒฝ์šฐ์—” ๊ทธ์— ๋งž๊ฒŒ ๋งคํ•‘ํ•ด์ฃผ๋ฉด ๋ผ.
- ๋งŒ์•ฝ ๊ต๊ณผ๋ชฉ์— ํ•™๊ธฐ๊ฐ€ ๋ช…ํ™•ํžˆ ๋‚˜๋ˆ ์ ธ์žˆ๊ฑฐ๋‚˜ ๋ช…์‹œ๋˜์–ด ์žˆ์ง€ ์•Š๊ณ , 1ํ•™๊ธฐ 2ํ•™๊ธฐ ๋ชจ๋‘์— ํ•ด๋‹น ๊ต๊ณผ๋ชฉ์ด ์žˆ๋Š” ๊ฒฝ์šฐ์—” 1ํ•™๊ธฐ์—๋งŒ ๋งคํ•‘ํ•ด์ฃผ๊ณ  2ํ•™๊ธฐ๋Š” ๋นˆ์นธ์œผ๋กœ ์ „๋‹ฌํ•ด์ค˜.
์›๋ฌธ์˜ ๋ˆ„๋ฝ์ด ์žˆ์–ด์„  ์•ˆ๋˜๊ณ , ๊ฐ€๋” ์˜คํƒ€๊ฐ€ ์žˆ๊ฑฐ๋‚˜ ๋ฌธ๋งฅ์— ๋งž์ง€ ์•Š์€ ๋„์–ด์“ฐ๊ธฐ๊ฐ€ ์žˆ๋‹ค๋ฉด ๋„ˆ์˜ ํŒ๋‹จ์— ์ˆ˜์ •ํ•ด์„œ ์ „๋‹ฌํ•ด์ค˜.
๋‹ค์‹œํ•œ๋ฒˆ ๋งํ•˜์ง€๋งŒ ์ ˆ๋Œ€๋กœ ์‘๋‹ต์— ๊ฐ์ƒ‰์ด๋‚˜ ๋ˆ„๋ฝ์ด ์žˆ์œผ๋ฉด ์•ˆ๋ผ.
ํ™•์ธํ•ด๋ณด๊ณ  ์ „๋‹ฌํ•  ์‘๋‹ต๊ฐ’์ด ์—†๋‹ค๋ฉด ๋นˆ ์‘๋‹ต๊ฐ’์„ ์ค˜๋„ ๊ดœ์ฐฎ์•„.
output์€ json๋งŒ ์ „๋‹ฌํ•ด์ค˜.
</instructions>
<grade-course-type>
{grade-course-type}
</grade-course-type>
<academic-achievement-html>
{academic-achievement-html}
</academic-achievement-html>
<academic-achievement-json>
{{
//๊ต๊ณผํ•™์Šต๋ฐœ๋‹ฌ์ƒํ™ฉ
"academicAchievement": [
{{
// ๊ธฐ๋ณธ, ์ง„๋กœ์„ ํƒ๊ณผ๋ชฉ, ์ฒด์œก, ์˜ˆ์ˆ  ๋“ฑ ๊ต๊ณผ์˜ type
"courseType": "BASIC", //BASIC(๊ธฐ๋ณธ), CAREER(์ง„๋กœ ์„ ํƒ ๊ณผ๋ชฉ), PE_ARTS(์ฒด์œก, ์˜ˆ์ˆ )
"grade": 1, //ํ•™๋…„
"semester": 1, //ํ•™๊ธฐ either 1 or 2
"subject": "๊ตญ์–ด", // ๊ต๊ณผ
"course": "๊ตญ์–ด", // ๊ณผ๋ชฉ
//์„ธ๋ถ€๋Šฅ๋ ฅ ๋ฐ ํŠน๊ธฐ์‚ฌํ•ญ ํ‘œ์•„๋ž˜์— ๊ฐ ๊ณผ๋ชฉ ์•„๋ž˜ ๋‚ด์šฉ
"detailedAbilities": "" //์„ธ๋ถ€๋Šฅ๋ ฅ๋ฐํŠน๊ธฐ์‚ฌํ•ญ ๊ณผ๋ชฉ๋ณ„ ๋ถ„๋ฆฌ
}},
{{
"courseType": "BASIC", //BASIC(๊ธฐ๋ณธ), CAREER(์ง„๋กœ ์„ ํƒ ๊ณผ๋ชฉ), PE_ARTS(์ฒด์œก, ์˜ˆ์ˆ )
"grade": 1, //ํ•™๋…„
"semester": 1, //ํ•™๊ธฐ
"subject": "์ˆ˜ํ•™", // ๊ต๊ณผ
"course": "์ˆ˜ํ•™", // ๊ณผ๋ชฉ
"detailedAbilities": "" //์„ธ๋ถ€๋Šฅ๋ ฅ๋ฐํŠน๊ธฐ์‚ฌํ•ญ ๊ณผ๋ชฉ๋ณ„ ๋ถ„๋ฆฌ
}}, ...
{{
"courseType": "BASIC", //BASIC(๊ธฐ๋ณธ), CAREER(์ง„๋กœ ์„ ํƒ ๊ณผ๋ชฉ), PE_ARTS(์ฒด์œก, ์˜ˆ์ˆ )
"grade": 1, //ํ•™๋…„
"semester": 2, //ํ•™๊ธฐ
"subject": "๊ตญ์–ด", // ๊ต๊ณผ
"course": "์ˆ˜ํ•™", // ๊ณผ๋ชฉ
"detailedAbilities": "" //์„ธ๋ถ€๋Šฅ๋ ฅ๋ฐํŠน๊ธฐ์‚ฌํ•ญ ๊ณผ๋ชฉ๋ณ„ ๋ถ„๋ฆฌ
}},
],
}}
</academic-achievement-json>
"""
class HTMLParser:
    """Namespace for turning raw HTML text into a BeautifulSoup tree."""

    @staticmethod
    def parse(html_content: str) -> BeautifulSoup:
        """Parse *html_content* with the ``html.parser`` backend and return the tree."""
        soup = BeautifulSoup(html_content, features='html.parser')
        return soup
class ChapterExtractor:
    """Split a parsed document into chapters keyed by their heading text.

    A chapter heading is a ``<p data-category="paragraph">`` element whose
    inner markup, after stripping whitespace, equals one of ``desired_keys``.
    """

    def __init__(self, soup: BeautifulSoup, desired_keys: List[str]):
        """Store the parsed document and the heading texts to look for."""
        self.soup = soup
        self.desired_keys = desired_keys

    def extract(self) -> Dict[str, str]:
        """Return ``{heading: html}`` for every desired heading in the document.

        The value is the concatenated markup of the siblings that follow the
        heading, up to (not including) the next desired heading, stripped of
        surrounding whitespace. When a heading occurs more than once the last
        occurrence wins. An empty ``desired_keys`` list yields an empty dict
        (the original read ``desired_keys[0]`` into an unused variable and
        raised IndexError in that case).
        """
        chapters = {}
        for tag in self.soup.find_all('p', {'data-category': 'paragraph'}):
            heading = tag.decode_contents().strip()
            if heading in self.desired_keys:
                chapters[heading] = self._extract_value(tag).strip()
        return chapters

    def _is_chapter_heading(self, tag) -> bool:
        """Return True when *tag* is a paragraph whose text is a desired heading."""
        return (tag.name == 'p'
                and tag.get('data-category') == 'paragraph'
                and tag.decode_contents().strip() in self.desired_keys)

    def _extract_value(self, tag) -> str:
        """Concatenate the markup of *tag*'s following siblings up to the next heading."""
        parts = []
        sibling = tag.find_next_sibling()
        while sibling and not self._is_chapter_heading(sibling):
            parts.append(str(sibling))
            sibling = sibling.find_next_sibling()
        # Join once instead of growing a string in the loop.
        return "".join(parts)
class SubjectAchievementExtractor:
    """Split the subject-achievement chapter into one HTML chunk per grade heading."""

    def __init__(self, chapter_content: str):
        """Parse the chapter HTML once and keep the resulting tree."""
        self.soup = HTMLParser.parse(chapter_content)

    def extract(self) -> Dict[str, str]:
        """Return ``{grade heading: html}`` for every grade heading paragraph.

        A grade heading is a ``<p data-category="paragraph">`` whose stripped
        inner markup matches the bracketed-grade pattern below. Each value
        starts with the heading element itself.
        """
        pattern = r'^\[(\d+)ํ•™๋…„\]$'
        per_grade = {}
        for paragraph in self.soup.find_all('p', {'data-category': 'paragraph'}):
            heading = paragraph.decode_contents().strip()
            if not re.match(pattern, heading):
                continue
            per_grade[heading] = self._extract_value(paragraph, pattern).strip()
        return per_grade

    def _extract_value(self, tag, pattern) -> str:
        """Return the markup of *tag* plus its following siblings up to the next grade heading."""
        pieces = [str(tag)]
        node = tag.find_next_sibling()
        while node:
            reached_next_grade = (
                node.name == 'p'
                and node.get('data-category') == 'paragraph'
                and re.match(pattern, node.decode_contents().strip())
            )
            if reached_next_grade:
                break
            pieces.append(str(node))
            node = node.find_next_sibling()
        return "".join(pieces)
class CourseTypeExtractor:
    """Split one grade's subject-achievement HTML into course-type sections.

    Sections are delimited by header elements whose inner markup contains an
    escaped angle-bracket pair (``&lt;`` ... ``&gt;``) together with the
    keywords of the career-elective header or of the PE/arts header.
    """

    # Keywords that must all occur in a header for it to open that section.
    _CAREER_KEYWORDS = ("์ง„๋กœ", "์„ ํƒ", "๊ณผ๋ชฉ")
    _PE_ARTS_KEYWORDS = ("์ฒด์œก", "์˜ˆ์ˆ ")

    def __init__(self, grade_content: str):
        """Parse the grade HTML once and keep the resulting tree."""
        self.grade_soup = HTMLParser.parse(grade_content)

    def extract(self) -> Dict[str, str]:
        """Return the HTML of each section keyed by BASIC / CAREER / PE_ARTS.

        BASIC is always present when the document has at least one tag and
        holds everything after the first tag up to the first section header.
        CAREER and PE_ARTS are present only when their header is found. An
        empty document yields an empty dict.
        """
        course_types = {}
        tags = self.grade_soup.find_all()
        if not tags:
            return course_types

        course_types["BASIC"] = self._extract_basic_course(tags[0]).strip()
        for tag in tags:
            key = tag.decode_contents().strip()
            if self._is_career_header(key):
                course_types["CAREER"] = self._extract_career_course(tag).strip()
            if self._is_pe_arts_header(key):
                course_types["PE_ARTS"] = self._extract_pe_arts_course(tag).strip()
        return course_types

    @staticmethod
    def _has_all(text: str, keywords) -> bool:
        """Return True when *text* is bracketed with &lt; &gt; and has every keyword."""
        return ("&lt;" in text and "&gt;" in text
                and all(word in text for word in keywords))

    @classmethod
    def _is_career_header(cls, text: str) -> bool:
        """Return True when *text* is the career-elective section header."""
        return cls._has_all(text, cls._CAREER_KEYWORDS)

    @classmethod
    def _is_pe_arts_header(cls, text: str) -> bool:
        """Return True when *text* is the PE/arts section header."""
        return cls._has_all(text, cls._PE_ARTS_KEYWORDS)

    def _collect_siblings(self, tag, is_boundary=None) -> str:
        """Concatenate the markup of *tag*'s following siblings.

        Collection stops before the first sibling whose stripped inner markup
        satisfies *is_boundary*; with no predicate it runs to the last sibling.
        """
        parts = []
        sibling = tag.find_next_sibling()
        while sibling:
            # Decode once per sibling instead of once per keyword test.
            if is_boundary is not None and is_boundary(sibling.decode_contents().strip()):
                break
            parts.append(str(sibling))
            sibling = sibling.find_next_sibling()
        return "".join(parts)

    def _extract_basic_course(self, tag) -> str:
        """Return the markup after *tag* up to the first section header.

        Stopping at the PE/arts header as well as the career header keeps the
        PE/arts section out of BASIC when a grade has no career section.
        """
        return self._collect_siblings(
            tag,
            lambda text: self._is_career_header(text) or self._is_pe_arts_header(text),
        )

    def _extract_career_course(self, tag) -> str:
        """Return the markup after the career header up to the PE/arts header."""
        return self._collect_siblings(tag, self._is_pe_arts_header)

    def _extract_pe_arts_course(self, tag) -> str:
        """Return all markup after the PE/arts header."""
        return self._collect_siblings(tag)
class DocumentProcessor:
    """Load a document as HTML and break it into chapters, grades and course types."""

    def __init__(self, file_path: str):
        """Create the document loader for *file_path* and set the chapter headings to extract."""
        self.file_path = file_path
        self.loader = UpstageDocumentParseLoader(file_path, ocr="force", output_format="html")
        self.desired_keys = [
            '4. ์ž๊ฒฉ์ฆ ๋ฐ ์ธ์ฆ ์ทจ๋“์ƒํ™ฉ',
            '5. ์ฐฝ์˜์  ์ฒดํ—˜ํ™œ๋™์ƒํ™ฉ',
            '6. ๊ต๊ณผํ•™์Šต๋ฐœ๋‹ฌ์ƒํ™ฉ',
            '7. ๋…์„œํ™œ๋™์ƒํ™ฉ',
            '8. ํ–‰๋™ํŠน์„ฑ ๋ฐ ์ข…ํ•ฉ์˜๊ฒฌ',
        ]

    def process(self):
        """Run the whole pipeline.

        Returns a 3-tuple ``(chapters, subject_achievement, course_types)``:
        chapter HTML by heading, subject-achievement HTML by grade heading,
        and per-grade dicts of HTML by course type.
        """
        soup = HTMLParser.parse(self._load_document())
        chapters = ChapterExtractor(soup, self.desired_keys).extract()
        subject_achievement = self._process_subject_achievement(chapters)
        return chapters, subject_achievement, self._process_course_types(subject_achievement)

    def _load_document(self) -> str:
        """Load every page through the loader and concatenate their contents."""
        return "".join([page.page_content for page in self.loader.load()])

    def _process_subject_achievement(self, chapters: Dict[str, str]) -> Dict[str, str]:
        """Split the subject-achievement chapter by grade; empty dict when the chapter is absent."""
        try:
            chapter_html = chapters['6. ๊ต๊ณผํ•™์Šต๋ฐœ๋‹ฌ์ƒํ™ฉ']
        except KeyError:
            return {}
        return SubjectAchievementExtractor(chapter_html).extract()

    def _process_course_types(self, subject_achievement: Dict[str, str]) -> Dict[str, Dict[str, str]]:
        """Split each grade's HTML by course type, keeping the grade headings as keys."""
        return {
            grade: CourseTypeExtractor(content).extract()
            for grade, content in subject_achievement.items()
        }
class CreativeExperientialActivity(BaseModel):
    """One creative-experiential-activity entry of the model's JSON reply."""

    # Activity area; the prompt template lists AUTONOMOUS, CLUB and CAREER.
    activityType: str
    # School grade the entry belongs to.
    grade: int
    # Number of hours recorded for the activity.
    hours: int
    # Free-text notes for the activity.
    specialNotes: str
    # Area of interest; the prompt template shows an empty string as placeholder.
    areaOfInterest: str
class AcademicAchievement(BaseModel):
    """One subject-achievement entry of the model's JSON reply."""

    # Course category; the prompt template lists BASIC, CAREER and PE_ARTS.
    courseType: str
    # School grade the entry belongs to.
    grade: int
    # Semester; the prompt template describes it as either 1 or 2.
    semester: int
    # Subject area and the specific course within it.
    subject: str
    course: str
    # Free-text description of detailed abilities for this course.
    detailedAbilities: str
class AcademicAchievementResult(BaseModel):
    """Top-level shape of the model's reply to the academic-achievement prompt."""

    # Entries found under the "academicAchievement" key of the reply.
    academicAchievement: List[AcademicAchievement]
class CreativeExperientialActivityResult(BaseModel):
    """Top-level shape of the model's reply to the creative-activities prompt."""

    # Entries found under the "creativeExperientialActivities" key of the reply.
    creativeExperientialActivities: List[CreativeExperientialActivity]
class OcrResult(BaseModel):
    """Combined result returned by ocr_process."""

    # Entries parsed from the creative-activities reply.
    creativeExperientialActivities: List[CreativeExperientialActivity]
    # Entries parsed from the academic-achievement replies, accumulated in one list.
    academicAchievement: List[AcademicAchievement]
# Suffix appended to the original prompt by get_complete_response when a reply
# was cut off. It asks the model to continue from where the previous reply
# ended; the previous reply is injected through the {latest-response} variable.
continuation_prompt = """
์•„๋ž˜ latest-response๋Š” ์ด์ „ ์‘๋‹ต์˜ ๋งˆ์ง€๋ง‰ ๋ถ€๋ถ„์ž…๋‹ˆ๋‹ค.
respond from where you left off.
your response would be appended to the latest-response, so there is no need to include the latest-response in your response.
your response appended to latest-response should be a full json output format.
if there is new line space remove them.
<latest-response>
{latest-response}
</latest-response>
"""
def clean_json_output(output):
    """Strip a surrounding Markdown code fence from a model reply.

    Leading and trailing whitespace is removed, then an opening fence
    (three backticks, optionally followed by the language tag ``json`` in any
    letter case) and a closing fence are dropped when present. The original
    only recognised an opening fence spelled exactly with lowercase ``json``,
    so a bare fence stayed in the text and broke JSON parsing.

    Args:
        output: Raw reply text.

    Returns:
        The reply without the fence, stripped of surrounding whitespace.
    """
    fence = "```"
    text = output.strip()
    if text.startswith(fence):
        text = text[len(fence):]
        # The language tag is optional and models vary its letter case.
        if text[:4].lower() == "json":
            text = text[4:]
    # A truncated reply may have an opening fence but no closing one, so the
    # two ends are handled independently.
    if text.endswith(fence):
        text = text[:-len(fence)]
    return text.strip()
def _is_truncated(response) -> bool:
    """Return True when *response* stopped because it hit the output token limit."""
    metadata = getattr(response, "response_metadata", None) or {}
    # Some clients report the cut-off as finish_reason == "length", others as
    # stop_reason == "max_tokens"; a missing key means "not truncated".
    return (metadata.get("finish_reason") == "length"
            or metadata.get("stop_reason") == "max_tokens")


def get_complete_response(client, prompt, input: dict):
    """Invoke *client* with *prompt* and keep asking until the reply is complete.

    The first call sends *prompt* filled with *input*. While a reply is
    reported as truncated, a follow-up call sends *prompt* plus
    ``continuation_prompt`` with the previous reply in the
    ``latest-response`` variable. Every reply is passed through
    ``clean_json_output`` and the pieces are concatenated.

    Args:
        client: Chat model that can be piped after a ChatPromptTemplate.
        prompt: Human-message template text.
        input: Values for the template variables. It is not modified; the
            original wrote ``latest-response`` into the caller's dict.

    Returns:
        The concatenated, cleaned reply text.
    """
    system_message = ("system", "You are a tutor for a high school student")
    # Work on a copy so the caller's dict never gains a "latest-response" key.
    variables = dict(input)

    first_chain = ChatPromptTemplate.from_messages(
        messages=[system_message, ("human", prompt)]
    ) | client
    response = first_chain.invoke(variables)
    print(response)
    chunks = [clean_json_output(response.content)]

    # The continuation chain is identical for every round, so it is built at
    # most once and only when a continuation is actually needed.
    continuation_chain = None
    while _is_truncated(response):
        if continuation_chain is None:
            continuation_chain = ChatPromptTemplate.from_messages(
                messages=[system_message, ("human", prompt + continuation_prompt)]
            ) | client
        variables["latest-response"] = response.content
        response = continuation_chain.invoke(variables)
        chunks.append(clean_json_output(response.content))
    return "".join(chunks)
def ocr_process(pdfFile: str):
    """Run OCR on a document and return its activities and achievements.

    The document is split by DocumentProcessor, the creative-activities
    chapter is sent to the model once, and every (grade, course type) section
    of the subject-achievement chapter is sent separately. Intermediate
    results are printed.

    Args:
        pdfFile: Path of the file handed to DocumentProcessor.

    Returns:
        An OcrResult with all parsed entries.

    Raises:
        KeyError: When the creative-activities chapter heading was not found.
        Exception: json.loads / model_validate errors propagate when a model
            reply is not valid JSON of the expected shape.
    """
    processor = DocumentProcessor(pdfFile)
    chapters, subject_achievement, course_types = processor.process()
    print("Chapters:", chapters)
    print("Subject Achievement:", subject_achievement)
    print("Course Types:", course_types)

    # NOTE: the original also built a ChatAnthropic client here that was never
    # used; it is dropped so no unused client is constructed.
    creative_response = get_complete_response(gpt_4o, creative_experiential_activities_prompt, {
        "creative-experiential-activities-html": chapters['5. ์ฐฝ์˜์  ์ฒดํ—˜ํ™œ๋™์ƒํ™ฉ'],
    })
    print("Response: " + creative_response)
    creative_result = CreativeExperientialActivityResult.model_validate(json.loads(creative_response))
    creative_activities = creative_result.creativeExperientialActivities

    print("-----ํ•™๋…„๋ณ„ ํ•ญ๋ชฉ๋ณ„ ๊ต๊ณผํ•™์Šต๋ฐœ๋‹ฌ์ƒํ™ฉ----")
    print(course_types)
    academic_results = []
    # course_types maps grade heading -> {course type -> html}. The original
    # loop named these the other way round, so its "{grade}_{courseType}"
    # label came out as "<course type>_<grade>"; the label is now grade first,
    # matching the <grade-course-type> tag it fills.
    for grade, sections_by_course_type in course_types.items():
        for course_type, section_html in sections_by_course_type.items():
            academic_response = get_complete_response(gpt_4o, academic_achievement_prompt, {
                "grade-course-type": f"{grade}_{course_type}",
                "academic-achievement-html": section_html,
            })
            result = AcademicAchievementResult.model_validate(json.loads(academic_response))
            print("----full response of academic_response----")
            print(result)
            academic_results.extend(result.academicAchievement)
    print(academic_results)

    print("----๊ฒฐ๊ณผ in json----")
    print(json.dumps({
        "creative-experiential-activity": [activity.model_dump() for activity in creative_activities],
        "academic-achievements": [achievement.model_dump() for achievement in academic_results]
    }, ensure_ascii=False, indent=4))
    return OcrResult(creativeExperientialActivities=creative_activities, academicAchievement=academic_results)