import json
import os
from bs4 import BeautifulSoup
import re
from typing import List, Dict
from pydantic import BaseModel
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_core.prompts import ChatPromptTemplate
from langchain_upstage import UpstageDocumentParseLoader
# The OpenAI key is read when the module is imported; a missing
# OPENAI_API_KEY environment variable raises KeyError right here.
openai_api_key = os.environ["OPENAI_API_KEY"]
# Module-level chat clients, both created with temperature=0.
# gpt_4o_mini is not referenced anywhere else in the visible source.
gpt_4o_mini = ChatOpenAI(model="gpt-4o-mini", temperature=0, openai_api_key=openai_api_key)
gpt_4o = ChatOpenAI(model="gpt-4o", temperature=0, openai_api_key=openai_api_key)
# Prompt template for the "creative experiential activities" section.
# It is passed as the human message of a ChatPromptTemplate, so
# {creative-experiential-activities-html} is the only template variable and
# the doubled braces render as literal JSON braces.
# The Korean instructions ask the model to map the supplied section into the
# JSON shape below without omissions or embellishment, fix only typos and
# spacing, fill areaOfInterest only for career activities (empty string when
# absent), and answer with JSON only.
# NOTE(review): the first instruction line reads as if tag-like tokens were
# dropped from it ("아래 을 형식으로 ...") - confirm the intended wording.
# NOTE(review): the sample uses a bare 3 for one "grade" value and quoted
# strings for the others; "hours" is shown as "" while the matching model
# field is an int - confirm what the model should emit.
creative_experiential_activities_prompt = """
아래 을 형식으로 매핑해서 응답으로 전달해줘.
원문의 누락이 있어선 안되고, 가끔 오타가 있거나 문맥에 맞지 않은 띄어쓰기가 있다면 너의 판단에 수정해서 전달해줘.
다시한번 말하지만 절대로 응답에 각색이나 누락이 있으면 안돼.
areaOfInterest는 희망분야를 의미하며 진로활동에만 들어가.
areaOfInterest는 그리고 희망분야라는 단어를 뺀 내용들만 들어가면 돼
areaOfInterest가 없는 경우엔 빈 문자열로 처리해줘.
output은 json만 전달해줘.
{creative-experiential-activities-html}
{{
//창의적 체험활동사항
"creativeExperientialActivities": [
{{
//영역
"activityType": "AUTONOMOUS", // AUTONOMOUS, CLUB, CAREER
//학년
"grade": "1", (integer)
//시간
"hours": "", (integer)
//특기사항
"specialNotes": "", (string)
//희망분야
"areaOfInterest": "" (string)
}},
{{
"activityType": "CLUB", // AUTONOMOUS, CLUB, CAREER
"grade": "1", //학년
"hours": "",
"specialNotes": "",
"areaOfInterest": ""
}},
{{
"activityType": "CAREER", // AUTONOMOUS, CLUB, CAREER
"grade": "1", //학년
"hours": "",
"specialNotes": "",
"areaOfInterest": ""
}},
{{
"activityType": "AUTONOMOUS", // AUTONOMOUS, CLUB, CAREER
"grade": "2", //학년
"hours": "",
"specialNotes": "",
"areaOfInterest": ""
}},
{{
"activityType": "CLUB", // AUTONOMOUS, CLUB, CAREER
"grade": "2", //학년
"hours": "",
"specialNotes": "",
"areaOfInterest": ""
}},
{{
"activityType": "CAREER", // AUTONOMOUS, CLUB, CAREER
"grade": "2", //학년
"hours": "",
"specialNotes": "",
"areaOfInterest": ""
}},
{{
"activityType": "AUTONOMOUS", // AUTONOMOUS, CLUB, CAREER
"grade": 3, //학년
"hours": "",
"specialNotes": "",
"areaOfInterest": ""
}},
{{
"activityType": "CLUB", // AUTONOMOUS, CLUB, CAREER
"grade": "3", //학년
"hours": "",
"specialNotes": "",
"areaOfInterest": ""
}},
{{
"activityType": "CAREER", // AUTONOMOUS, CLUB, CAREER
"grade": "3", //학년
"hours": "",
"specialNotes": "",
"areaOfInterest": ""
}},
],
}}
"""
# Prompt template for the "academic achievement" section.
# Template variables: {grade-course-type} (a label identifying the grade and
# course type of the fragment) and {academic-achievement-html} (the HTML
# fragment itself). Doubled braces render as literal JSON braces.
# The Korean instructions ask the model to assign each "detailed abilities"
# note to its subject/course, to honour explicit semester splits, and - when
# a course appears in both semesters without an explicit split - to fill
# semester 1 only and leave semester 2 blank. Output must be JSON only.
# NOTE(review): the first two instruction lines read as if tag-like tokens
# were dropped ("아래 을 형식으로", "은는 에") - confirm the intended wording.
# NOTE(review): the last sample pairs subject "국어" with course "수학";
# confirm whether that mismatch is intentional.
academic_achievement_prompt = """
아래 을 형식으로 매핑해서 응답으로 전달해줘.
학년과 course type은는 에 나와있을 예정이야.
academic-achievement-html의 세부 능력 및 특기사항엔 교과, 과목에 대한 학생의 상세 성취 및 특기사항이 들어가고 있어.
세부능력 및 특기사항 아래의 내용은 아래처럼 구성되어 있으니까 잘 참고해서 교과목별로 잘 할당해서 매핑해줘
세부능력 및 특기사항을 아주 꼼꼼히 확인해서 응답해줘.
- 교과목 : 특기사항 및 성취도
- 교과목에 학기가 나눠져서 작성되어 있는 경우엔 그에 맞게 매핑해주면 돼.
- 만약 교과목에 학기가 명확히 나눠져있거나 명시되어 있지 않고, 1학기 2학기 모두에 해당 교과목이 있는 경우엔 1학기에만 매핑해주고 2학기는 빈칸으로 전달해줘.
원문의 누락이 있어선 안되고, 가끔 오타가 있거나 문맥에 맞지 않은 띄어쓰기가 있다면 너의 판단에 수정해서 전달해줘.
다시한번 말하지만 절대로 응답에 각색이나 누락이 있으면 안돼.
확인해보고 전달할 응답값이 없다면 빈 응답값을 줘도 괜찮아.
output은 json만 전달해줘.
{grade-course-type}
{academic-achievement-html}
{{
//교과학습발달상황
"academicAchievement": [
{{
// 기본, 진로선택과목, 체육, 예술 등 교과의 type
"courseType": "BASIC", //BASIC(기본), CAREER(진로 선택 과목), PE_ARTS(체육, 예술)
"grade": 1, //학년
"semester": 1, //학기 either 1 or 2
"subject": "국어", // 교과
"course": "국어", // 과목
//세부능력 및 특기사항 표아래에 각 과목 아래 내용
"detailedAbilities": "" //세부능력및특기사항 과목별 분리
}},
{{
"courseType": "BASIC", //BASIC(기본), CAREER(진로 선택 과목), PE_ARTS(체육, 예술)
"grade": 1, //학년
"semester": 1, //학기
"subject": "수학", // 교과
"course": "수학", // 과목
"detailedAbilities": "" //세부능력및특기사항 과목별 분리
}}, ...
{{
"courseType": "BASIC", //BASIC(기본), CAREER(진로 선택 과목), PE_ARTS(체육, 예술)
"grade": 1, //학년
"semester": 2, //학기
"subject": "국어", // 교과
"course": "수학", // 과목
"detailedAbilities": "" //세부능력및특기사항 과목별 분리
}},
],
}}
"""
class HTMLParser:
    """Single entry point for turning an HTML string into a BeautifulSoup tree."""

    @staticmethod
    def parse(html_content: str) -> BeautifulSoup:
        """Parse ``html_content`` with the ``html.parser`` tree builder and return the soup."""
        soup = BeautifulSoup(html_content, features='html.parser')
        return soup
class ChapterExtractor:
    """Split a parsed document into chapters keyed by their heading text.

    A chapter heading is a ``<p data-category="paragraph">`` whose stripped
    inner markup equals one of ``desired_keys``. The chapter body is the
    markup of every following sibling up to the next such heading.
    """

    def __init__(self, soup: BeautifulSoup, desired_keys: List[str]):
        self.soup = soup
        self.desired_keys = desired_keys

    def extract(self) -> Dict[str, str]:
        """Return ``{heading: body_html}`` for every desired heading found.

        Headings that do not occur in the document are absent from the
        result, and an empty ``desired_keys`` yields an empty dict. If a
        heading occurs more than once, the last occurrence wins.
        """
        chapters = {}
        for tag in self.soup.find_all('p', {'data-category': 'paragraph'}):
            heading = tag.decode_contents().strip()
            if heading in self.desired_keys:
                chapters[heading] = self._extract_value(tag).strip()
        return chapters

    def _is_chapter_heading(self, tag) -> bool:
        """Tell whether ``tag`` is a paragraph whose text is one of the desired headings."""
        return (
            tag.name == 'p'
            and tag.get('data-category') == 'paragraph'
            and tag.decode_contents().strip() in self.desired_keys
        )

    def _extract_value(self, tag) -> str:
        """Concatenate the markup of the siblings after ``tag`` up to the next chapter heading."""
        pieces = []
        sibling = tag.find_next_sibling()
        while sibling is not None and not self._is_chapter_heading(sibling):
            pieces.append(str(sibling))
            sibling = sibling.find_next_sibling()
        return "".join(pieces)
class SubjectAchievementExtractor:
    """Slice the academic-achievement chapter into one HTML fragment per grade.

    A grade heading is a ``<p data-category="paragraph">`` whose stripped
    inner markup matches the grade pattern used in ``extract``.
    """

    def __init__(self, chapter_content: str):
        self.soup = HTMLParser.parse(chapter_content)

    def extract(self) -> Dict[str, str]:
        """Return ``{grade_heading: html}``; each fragment starts with its own heading tag."""
        grade_heading = r'^\[(\d+)학년\]$'
        fragments_by_grade = {}
        for paragraph in self.soup.find_all('p', {'data-category': 'paragraph'}):
            heading = paragraph.decode_contents().strip()
            if not re.match(grade_heading, heading):
                continue
            fragment = self._extract_value(paragraph, grade_heading)
            fragments_by_grade[heading] = fragment.strip()
        return fragments_by_grade

    def _extract_value(self, tag, pattern) -> str:
        """Return the markup of ``tag`` plus its following siblings up to the next grade heading."""
        fragments = [str(tag)]
        sibling = tag.find_next_sibling()
        while sibling:
            starts_next_grade = (
                sibling.name == 'p'
                and sibling.get('data-category') == 'paragraph'
                and re.match(pattern, sibling.decode_contents().strip())
            )
            if starts_next_grade:
                break
            fragments.append(str(sibling))
            sibling = sibling.find_next_sibling()
        return "".join(fragments)
class CourseTypeExtractor:
    """Split one grade's HTML into BASIC / CAREER / PE_ARTS fragments.

    Section boundaries are tags whose stripped inner markup contains both
    "<" and ">" together with the section's keywords.
    NOTE(review): ``decode_contents()`` renders nested markup, so the angle
    brackets may come from child tags as well as from heading text - confirm
    that the markers are matched the way the OCR output requires.
    """

    def __init__(self, grade_content: str):
        self.grade_soup = HTMLParser.parse(grade_content)

    def extract(self) -> Dict[str, str]:
        """Return a dict with "BASIC" (whenever any tag exists) and, if found, "CAREER" / "PE_ARTS"."""
        sections = {}
        for position, tag in enumerate(self.grade_soup.find_all()):
            text = tag.decode_contents().strip()
            if position == 0:
                # The basic section is whatever follows the very first tag.
                sections["BASIC"] = self._extract_basic_course(tag).strip()
            if not self._has_angle_brackets(text):
                continue
            # Two independent checks: a tag matching both sets of keywords
            # starts both sections.
            if self._mentions_career(text):
                sections["CAREER"] = self._extract_career_course(tag).strip()
            if self._mentions_pe_arts(text):
                sections["PE_ARTS"] = self._extract_pe_arts_course(tag).strip()
        return sections

    @staticmethod
    def _has_angle_brackets(text) -> bool:
        """Tell whether ``text`` contains both "<" and ">"."""
        return "<" in text and ">" in text

    @staticmethod
    def _mentions_career(text) -> bool:
        """Tell whether ``text`` contains every career-elective keyword."""
        return "진로" in text and "선택" in text and "과목" in text

    @staticmethod
    def _mentions_pe_arts(text) -> bool:
        """Tell whether ``text`` contains both PE/arts keywords."""
        return "체육" in text and "예술" in text

    @staticmethod
    def _following_siblings(tag):
        """Yield each sibling after ``tag`` in document order."""
        sibling = tag.find_next_sibling()
        while sibling:
            yield sibling
            sibling = sibling.find_next_sibling()

    def _extract_basic_course(self, tag) -> str:
        """Collect sibling markup after ``tag`` until the career-elective marker."""
        collected = []
        for sibling in self._following_siblings(tag):
            text = sibling.decode_contents().strip()
            if self._has_angle_brackets(text) and self._mentions_career(text):
                break
            collected.append(str(sibling))
        return "".join(collected)

    def _extract_career_course(self, tag) -> str:
        """Collect sibling markup after ``tag`` until the PE/arts marker."""
        collected = []
        for sibling in self._following_siblings(tag):
            text = sibling.decode_contents().strip()
            if self._has_angle_brackets(text) and self._mentions_pe_arts(text):
                break
            collected.append(str(sibling))
        return "".join(collected)

    def _extract_pe_arts_course(self, tag) -> str:
        """Collect the markup of every sibling after ``tag``."""
        return "".join(str(sibling) for sibling in self._following_siblings(tag))
class DocumentProcessor:
    """Load a document through the Upstage parser and break it into chapters, grades and course types."""

    def __init__(self, file_path: str):
        self.file_path = file_path
        self.loader = UpstageDocumentParseLoader(file_path, ocr="force", output_format="html")
        # Chapter headings that delimit the sections of interest.
        self.desired_keys = [
            '4. 자격증 및 인증 취득상황',
            '5. 창의적 체험활동상황',
            '6. 교과학습발달상황',
            '7. 독서활동상황',
            '8. 행동특성 및 종합의견',
        ]

    def process(self):
        """Return ``(chapters, subject_achievement, course_types)`` for the loaded document."""
        soup = HTMLParser.parse(self._load_document())
        chapters = ChapterExtractor(soup, self.desired_keys).extract()
        subject_achievement = self._process_subject_achievement(chapters)
        course_types = self._process_course_types(subject_achievement)
        return chapters, subject_achievement, course_types

    def _load_document(self) -> str:
        """Concatenate the ``page_content`` of every page returned by the loader."""
        contents = [page.page_content for page in self.loader.load()]
        return "".join(contents)

    def _process_subject_achievement(self, chapters: Dict[str, str]) -> Dict[str, str]:
        """Split the academic-achievement chapter per grade; empty dict when the chapter is absent."""
        try:
            chapter_html = chapters['6. 교과학습발달상황']
        except KeyError:
            return {}
        return SubjectAchievementExtractor(chapter_html).extract()

    def _process_course_types(self, subject_achievement: Dict[str, str]) -> Dict[str, Dict[str, str]]:
        """Map each grade key to its ``{course_type: html}`` dict."""
        return {
            grade: CourseTypeExtractor(content).extract()
            for grade, content in subject_achievement.items()
        }
class CreativeExperientialActivity(BaseModel):
    """One creative-experiential-activity entry, as requested from the model by the prompt."""

    # Activity area; the prompt lists AUTONOMOUS, CLUB and CAREER.
    activityType: str
    # School year.
    grade: int
    # Hours recorded for the activity.
    # NOTE(review): the prompt sample shows "hours": "" - an empty string
    # would presumably fail int validation; confirm how blanks should be handled.
    hours: int
    # Free-text notes for the activity.
    specialNotes: str
    # Desired field; the prompt asks for it on career activities only and an
    # empty string otherwise.
    areaOfInterest: str
class AcademicAchievement(BaseModel):
    """One per-course academic-achievement entry, as requested from the model by the prompt."""

    # Course category; the prompt lists BASIC, CAREER and PE_ARTS.
    courseType: str
    # School year.
    grade: int
    # Semester; the prompt asks for either 1 or 2.
    semester: int
    # Subject area.
    subject: str
    # Course within the subject area.
    course: str
    # Detailed abilities and special notes for this course.
    detailedAbilities: str
class AcademicAchievementResult(BaseModel):
    """Top-level shape of the JSON returned for the academic-achievement prompt."""

    academicAchievement: List[AcademicAchievement]
class CreativeExperientialActivityResult(BaseModel):
    """Top-level shape of the JSON returned for the creative-activities prompt."""

    creativeExperientialActivities: List[CreativeExperientialActivity]
class OcrResult(BaseModel):
    """Combined result returned by ``ocr_process``: creative activities plus academic achievements."""

    creativeExperientialActivities: List[CreativeExperientialActivity]
    academicAchievement: List[AcademicAchievement]
# Suffix appended to the original prompt when a reply was cut off by the token
# limit. {latest-response} is a template variable filled with the content of
# the previous (truncated) reply; the model is asked to continue from there
# without repeating it, so the pieces can be concatenated into one JSON document.
continuation_prompt = """
아래 latest-response는 이전 응답의 마지막 부분입니다.
respond from where you left off.
your response would be appended to the latest-response, so there is no need to include the latest-response in your response.
your response appended to latest-response should be a full json output format.
if there is new line space remove them.
{latest-response}
"""
def clean_json_output(output: str) -> str:
    """Strip surrounding whitespace and Markdown code fences from a model reply.

    Handles an opening fence with or without a ``json`` language tag (in any
    letter case) and a closing fence. Either fence may be missing, which
    happens when a reply is split across several chunks.

    Args:
        output: Raw text returned by the model.

    Returns:
        The text without the fences and without leading/trailing whitespace.
    """
    text = output.strip()
    if text.startswith("```"):
        text = text[3:]
        # Drop the optional language tag that follows the opening fence.
        if text[:4].lower() == "json":
            text = text[4:]
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()
def _response_hit_token_limit(response) -> bool:
    """Tell whether the provider reported that the reply was cut off by the token limit.

    OpenAI-style metadata reports ``finish_reason == "length"`` and
    Anthropic-style metadata reports ``stop_reason == "max_tokens"``. A reply
    carrying neither key is treated as complete.
    """
    metadata = getattr(response, "response_metadata", None) or {}
    return (
        metadata.get("finish_reason") == "length"
        or metadata.get("stop_reason") == "max_tokens"
    )


def get_complete_response(client, prompt, input: dict):
    """Invoke ``client`` with ``prompt`` and keep asking for continuations until the reply is complete.

    Args:
        client: Chat model that the prompt template is piped into.
        prompt: Human-message template; its variables are filled from ``input``.
        input: Template variables. The dict is copied, so the caller's object
            is never modified.

    Returns:
        The concatenation of every reply chunk, each passed through
        ``clean_json_output`` before being joined.
    """
    system_message = ("system", "You are a tutor for a high school student")
    # Work on a copy: the continuation rounds add a "latest-response" entry.
    variables = dict(input)

    first_template = ChatPromptTemplate.from_messages(
        messages=[system_message, ("human", prompt)]
    )
    response = (first_template | client).invoke(variables)
    print(response)
    chunks = [clean_json_output(response.content)]

    # The continuation chain does not change between rounds, so it is built
    # once, and only if a continuation is actually needed.
    continuation_chain = None
    while _response_hit_token_limit(response):
        if continuation_chain is None:
            continuation_template = ChatPromptTemplate.from_messages(
                messages=[system_message, ("human", prompt + continuation_prompt)]
            )
            continuation_chain = continuation_template | client
        # Only the most recent chunk is shown to the model as context.
        variables["latest-response"] = response.content
        response = continuation_chain.invoke(variables)
        chunks.append(clean_json_output(response.content))
    return "".join(chunks)
def ocr_process(pdfFile: str):
    """Run the full extraction pipeline on one document and return the structured result.

    The document is parsed into chapters, the creative-activities chapter and
    every (grade, course type) fragment of the academic-achievement chapter
    are sent to the module-level ``gpt_4o`` client, and the JSON replies are
    validated into pydantic models.

    Args:
        pdfFile: Path of the document handed to ``DocumentProcessor``.

    Returns:
        An ``OcrResult`` holding the creative activities and the academic
        achievements. The creative-activities list is empty when that chapter
        was not found in the document.

    Raises:
        json.JSONDecodeError: If a model reply is not valid JSON.
        pydantic.ValidationError: If a reply does not match the expected schema.
    """
    processor = DocumentProcessor(pdfFile)
    chapters, subject_achievement, course_types = processor.process()
    print("Chapters:", chapters)
    print("Subject Achievement:", subject_achievement)
    print("Course Types:", course_types)

    creative_chapter_key = '5. 창의적 체험활동상황'
    creative_html = chapters.get(creative_chapter_key)
    if creative_html is None:
        # Same tolerance as for a missing academic-achievement chapter:
        # an absent chapter produces no entries instead of a KeyError.
        print(f"Chapter not found, skipping: {creative_chapter_key}")
        creative_activities = []
    else:
        creative_response = get_complete_response(gpt_4o, creative_experiential_activities_prompt, {
            "creative-experiential-activities-html": creative_html,
        })
        print("Response: " + creative_response)
        creative_result = CreativeExperientialActivityResult.model_validate(json.loads(creative_response))
        creative_activities = creative_result.creativeExperientialActivities

    print("-----학년별 항목별 교과학습발달상황----")
    print(course_types)
    academic_results = []
    # course_types is {grade: {course_type: html}}: the outer key is the grade.
    for grade, sections in course_types.items():
        for course_type, section_html in sections.items():
            academic_response = get_complete_response(gpt_4o, academic_achievement_prompt, {
                "grade-course-type": f"{grade}_{course_type}",
                "academic-achievement-html": section_html,
            })
            section_result = AcademicAchievementResult.model_validate(json.loads(academic_response))
            print("----full response of academic_response----")
            print(section_result)
            academic_results.extend(section_result.academicAchievement)
    print(academic_results)

    print("----결과 in json----")
    print(json.dumps({
        "creative-experiential-activity": [activity.model_dump() for activity in creative_activities],
        "academic-achievements": [achievement.model_dump() for achievement in academic_results]
    }, ensure_ascii=False, indent=4))
    return OcrResult(creativeExperientialActivities=creative_activities, academicAchievement=academic_results)