"""OCR pipeline for Korean student records.

A PDF is parsed to HTML with the Upstage document parser, split into
chapters / grades / course types with BeautifulSoup, and the relevant
fragments are sent to a chat model that maps them onto the pydantic models
defined in this module.
"""

import json
import os
import re
from typing import Callable, Dict, List, Optional

from bs4 import BeautifulSoup
from langchain_anthropic import ChatAnthropic
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_upstage import UpstageDocumentParseLoader
from pydantic import BaseModel

# NOTE: read at import time; a missing variable raises KeyError on import.
openai_api_key = os.environ["OPENAI_API_KEY"]

gpt_4o_mini = ChatOpenAI(model="gpt-4o-mini", temperature=0, openai_api_key=openai_api_key)
gpt_4o = ChatOpenAI(model="gpt-4o", temperature=0, openai_api_key=openai_api_key)

# Prompt template for the "creative experiential activities" chapter.
# Template variable: {creative-experiential-activities-html}.
# Doubled braces are literal braces in the rendered prompt.
creative_experiential_activities_prompt = """
아래 형식으로 매핑해서 응답으로 전달해줘.
원문의 누락이 있어선 안되고, 가끔 오타가 있거나 문맥에 맞지 않은 띄어쓰기가 있다면 너의 판단에 수정해서 전달해줘.
다시한번 말하지만 절대로 응답에 각색이나 누락이 있으면 안돼.
areaOfInterest는 희망분야를 의미하며 진로활동에만 들어가.
areaOfInterest는 그리고 희망분야라는 단어를 뺀 내용들만 들어가면 돼
areaOfInterest가 없는 경우엔 빈 문자열로 처리해줘.
output은 json만 전달해줘.

{creative-experiential-activities-html}

{{
    //창의적 체험활동사항
    "creativeExperientialActivities": [
        {{
            //영역
            "activityType": "AUTONOMOUS", // AUTONOMOUS, CLUB, CAREER
            //학년
            "grade": "1", (integer)
            //시간
            "hours": "", (integer)
            //특기사항
            "specialNotes": "", (string)
            //희망분야
            "areaOfInterest": "" (string)
        }},
        {{
            "activityType": "CLUB", // AUTONOMOUS, CLUB, CAREER
            "grade": "1", //학년
            "hours": "",
            "specialNotes": "",
            "areaOfInterest": ""
        }},
        {{
            "activityType": "CAREER", // AUTONOMOUS, CLUB, CAREER
            "grade": "1", //학년
            "hours": "",
            "specialNotes": "",
            "areaOfInterest": ""
        }},
        {{
            "activityType": "AUTONOMOUS", // AUTONOMOUS, CLUB, CAREER
            "grade": "2", //학년
            "hours": "",
            "specialNotes": "",
            "areaOfInterest": ""
        }},
        {{
            "activityType": "CLUB", // AUTONOMOUS, CLUB, CAREER
            "grade": "2", //학년
            "hours": "",
            "specialNotes": "",
            "areaOfInterest": ""
        }},
        {{
            "activityType": "CAREER", // AUTONOMOUS, CLUB, CAREER
            "grade": "2", //학년
            "hours": "",
            "specialNotes": "",
            "areaOfInterest": ""
        }},
        {{
            "activityType": "AUTONOMOUS", // AUTONOMOUS, CLUB, CAREER
            "grade": 3, //학년
            "hours": "",
            "specialNotes": "",
            "areaOfInterest": ""
        }},
        {{
            "activityType": "CLUB", // AUTONOMOUS, CLUB, CAREER
            "grade": "3", //학년
            "hours": "",
            "specialNotes": "",
            "areaOfInterest": ""
        }},
        {{
            "activityType": "CAREER", // AUTONOMOUS, CLUB, CAREER
            "grade": "3", //학년
            "hours": "",
            "specialNotes": "",
            "areaOfInterest": ""
        }},
    ],
}}
"""

# Prompt template for one (grade, course type) slice of the academic chapter.
# Template variables: {grade-course-type}, {academic-achievement-html}.
academic_achievement_prompt = """
아래 형식으로 매핑해서 응답으로 전달해줘.
학년과 course type은는 에 나와있을 예정이야.
academic-achievement-html의 세부 능력 및 특기사항엔 교과, 과목에 대한 학생의 상세 성취 및 특기사항이 들어가고 있어.
세부능력 및 특기사항 아래의 내용은 아래처럼 구성되어 있으니까 잘 참고해서 교과목별로 잘 할당해서 매핑해줘
세부능력 및 특기사항을 아주 꼼꼼히 확인해서 응답해줘.
- 교과목 : 특기사항 및 성취도
- 교과목에 학기가 나눠져서 작성되어 있는 경우엔 그에 맞게 매핑해주면 돼.
- 만약 교과목에 학기가 명확히 나눠져있거나 명시되어 있지 않고, 1학기 2학기 모두에 해당 교과목이 있는 경우엔 1학기에만 매핑해주고 2학기는 빈칸으로 전달해줘.
원문의 누락이 있어선 안되고, 가끔 오타가 있거나 문맥에 맞지 않은 띄어쓰기가 있다면 너의 판단에 수정해서 전달해줘.
다시한번 말하지만 절대로 응답에 각색이나 누락이 있으면 안돼.
확인해보고 전달할 응답값이 없다면 빈 응답값을 줘도 괜찮아.
output은 json만 전달해줘.

{grade-course-type}

{academic-achievement-html}

{{
    //교과학습발달상황
    "academicAchievement": [
        {{
            // 기본, 진로선택과목, 체육, 예술 등 교과의 type
            "courseType": "BASIC", //BASIC(기본), CAREER(진로 선택 과목), PE_ARTS(체육, 예술)
            "grade": 1, //학년
            "semester": 1, //학기 either 1 or 2
            "subject": "국어", // 교과
            "course": "국어", // 과목
            //세부능력 및 특기사항 표아래에 각 과목 아래 내용
            "detailedAbilities": "" //세부능력및특기사항 과목별 분리
        }},
        {{
            "courseType": "BASIC", //BASIC(기본), CAREER(진로 선택 과목), PE_ARTS(체육, 예술)
            "grade": 1, //학년
            "semester": 1, //학기
            "subject": "수학", // 교과
            "course": "수학", // 과목
            "detailedAbilities": "" //세부능력및특기사항 과목별 분리
        }},
        ...
        {{
            "courseType": "BASIC", //BASIC(기본), CAREER(진로 선택 과목), PE_ARTS(체육, 예술)
            "grade": 1, //학년
            "semester": 2, //학기
            "subject": "국어", // 교과
            "course": "수학", // 과목
            "detailedAbilities": "" //세부능력및특기사항 과목별 분리
        }},
    ],
}}
"""


class HTMLParser:
    """Thin wrapper that fixes the BeautifulSoup backend used by this module."""

    @staticmethod
    def parse(html_content: str) -> BeautifulSoup:
        """Parse *html_content* with the stdlib ``html.parser`` backend."""
        return BeautifulSoup(html_content, 'html.parser')


class ChapterExtractor:
    """Split a parsed document into chapters keyed by their heading text."""

    def __init__(self, soup: BeautifulSoup, desired_keys: List[str]):
        self.soup = soup
        self.desired_keys = desired_keys

    def extract(self) -> Dict[str, str]:
        """Return ``{heading: html}`` for every desired heading found.

        A heading is a ``<p data-category="paragraph">`` whose stripped inner
        HTML equals one of ``desired_keys``. Headings that do not occur in
        the document are simply absent from the result.
        """
        chapters = {}
        for tag in self.soup.find_all('p', {'data-category': 'paragraph'}):
            heading = tag.decode_contents().strip()
            if heading in self.desired_keys:
                chapters[heading] = self._extract_value(tag).strip()
        return chapters

    def _is_chapter_heading(self, tag) -> bool:
        """Return True when *tag* is a paragraph holding a desired heading."""
        return (
            tag.name == 'p'
            and tag.get('data-category') == 'paragraph'
            and tag.decode_contents().strip() in self.desired_keys
        )

    def _extract_value(self, tag) -> str:
        """Concatenate the siblings after *tag* up to the next chapter heading."""
        parts = []
        sibling = tag.find_next_sibling()
        while sibling and not self._is_chapter_heading(sibling):
            parts.append(str(sibling))
            sibling = sibling.find_next_sibling()
        return "".join(parts)


class SubjectAchievementExtractor:
    """Split the academic-achievement chapter into per-grade HTML fragments."""

    # Matches grade headings such as "[1학년]".
    _GRADE_HEADING = re.compile(r'^\[(\d+)학년\]$')

    def __init__(self, chapter_content: str):
        self.soup = HTMLParser.parse(chapter_content)

    def extract(self) -> Dict[str, str]:
        """Return ``{grade_heading: html}``; each value starts with its heading tag."""
        per_grade = {}
        pattern = self._GRADE_HEADING
        for tag in self.soup.find_all('p', {'data-category': 'paragraph'}):
            heading = tag.decode_contents().strip()
            if re.match(pattern, heading):
                per_grade[heading] = self._extract_value(tag, pattern).strip()
        return per_grade

    def _extract_value(self, tag, pattern) -> str:
        """Return *tag* plus its following siblings up to the next grade heading."""
        # Unlike ChapterExtractor, the heading tag itself is part of the value.
        parts = [str(tag)]
        sibling = tag.find_next_sibling()
        while sibling and not (
            sibling.name == 'p'
            and sibling.get('data-category') == 'paragraph'
            and re.match(pattern, sibling.decode_contents().strip())
        ):
            parts.append(str(sibling))
            sibling = sibling.find_next_sibling()
        return "".join(parts)


class CourseTypeExtractor:
    """Split one grade's HTML into BASIC / CAREER / PE_ARTS fragments."""

    def __init__(self, grade_content: str):
        self.grade_soup = HTMLParser.parse(grade_content)

    def extract(self) -> Dict[str, str]:
        """Return ``{course_type: html}`` for the course types that were found.

        The siblings following the first tag of the fragment form the
        ``BASIC`` section. A tag whose inner HTML contains ``<`` and ``>``
        together with the career or PE/arts keywords starts the ``CAREER`` or
        ``PE_ARTS`` section respectively; a later match overwrites an
        earlier one for the same course type.
        """
        course_types = {}
        is_first = True
        for tag in self.grade_soup.find_all():
            text = tag.decode_contents().strip()
            if is_first:
                course_types["BASIC"] = self._extract_basic_course(tag).strip()
                is_first = False
            # The two checks are deliberately independent (not elif): a tag
            # matching both keyword sets populates both sections.
            if self._is_career_heading(text):
                course_types["CAREER"] = self._extract_career_course(tag).strip()
            if self._is_pe_arts_heading(text):
                course_types["PE_ARTS"] = self._extract_pe_arts_course(tag).strip()
        return course_types

    @staticmethod
    def _is_career_heading(text: str) -> bool:
        """Return True when *text* looks like the career-elective section heading."""
        return (
            "<" in text and ">" in text
            and "진로" in text and "선택" in text and "과목" in text
        )

    @staticmethod
    def _is_pe_arts_heading(text: str) -> bool:
        """Return True when *text* looks like the PE/arts section heading."""
        return "<" in text and ">" in text and "체육" in text and "예술" in text

    @staticmethod
    def _collect_following_siblings(
        tag, is_stop: Optional[Callable[[str], bool]] = None
    ) -> str:
        """Concatenate the siblings after *tag* until *is_stop* accepts one.

        *is_stop* receives the stripped inner HTML of each sibling; with
        ``None`` every remaining sibling is collected.
        """
        parts = []
        sibling = tag.find_next_sibling()
        while sibling:
            if is_stop is not None and is_stop(sibling.decode_contents().strip()):
                break
            parts.append(str(sibling))
            sibling = sibling.find_next_sibling()
        return "".join(parts)

    def _extract_basic_course(self, tag) -> str:
        """Collect siblings after *tag* up to the career-elective heading."""
        return self._collect_following_siblings(tag, self._is_career_heading)

    def _extract_career_course(self, tag) -> str:
        """Collect siblings after *tag* up to the PE/arts heading."""
        return self._collect_following_siblings(tag, self._is_pe_arts_heading)

    def _extract_pe_arts_course(self, tag) -> str:
        """Collect every sibling after *tag*."""
        return self._collect_following_siblings(tag)


class DocumentProcessor:
    """Load a document via Upstage and break it down for the LLM prompts."""

    def __init__(self, file_path: str):
        self.file_path = file_path
        self.loader = UpstageDocumentParseLoader(file_path, ocr="force", output_format="html")
        # Chapter headings that delimit the sections of the record.
        self.desired_keys = [
            '4. 자격증 및 인증 취득상황',
            '5. 창의적 체험활동상황',
            '6. 교과학습발달상황',
            '7. 독서활동상황',
            '8. 행동특성 및 종합의견',
        ]

    def process(self):
        """Return ``(chapters, subject_achievement, course_types)``.

        * ``chapters``: ``{chapter_heading: html}``
        * ``subject_achievement``: ``{grade_heading: html}``
        * ``course_types``: ``{grade_heading: {course_type: html}}``
        """
        html_contents = self._load_document()
        soup = HTMLParser.parse(html_contents)
        chapters = ChapterExtractor(soup, self.desired_keys).extract()
        subject_achievement = self._process_subject_achievement(chapters)
        course_types = self._process_course_types(subject_achievement)
        return chapters, subject_achievement, course_types

    def _load_document(self) -> str:
        """Load all pages and join their contents into one string."""
        pages = self.loader.load()
        return "".join(page.page_content for page in pages)

    def _process_subject_achievement(self, chapters: Dict[str, str]) -> Dict[str, str]:
        """Split the academic chapter per grade; empty dict if it is missing."""
        chapter = chapters.get('6. 교과학습발달상황')
        if chapter is None:
            return {}
        return SubjectAchievementExtractor(chapter).extract()

    def _process_course_types(self, subject_achievement: Dict[str, str]) -> Dict[str, Dict[str, str]]:
        """Split every grade fragment into its course-type sections."""
        return {
            grade: CourseTypeExtractor(content).extract()
            for grade, content in subject_achievement.items()
        }


class CreativeExperientialActivity(BaseModel):
    """One creative-activity entry, mirroring the JSON keys in the prompt."""

    activityType: str  # the prompt lists AUTONOMOUS, CLUB, CAREER
    grade: int
    hours: int
    specialNotes: str
    areaOfInterest: str


class AcademicAchievement(BaseModel):
    """One subject/course entry, mirroring the JSON keys in the prompt."""

    courseType: str  # the prompt lists BASIC, CAREER, PE_ARTS
    grade: int
    semester: int
    subject: str
    course: str
    detailedAbilities: str


class AcademicAchievementResult(BaseModel):
    """Top-level shape of the academic-achievement LLM response."""

    academicAchievement: List[AcademicAchievement]


class CreativeExperientialActivityResult(BaseModel):
    """Top-level shape of the creative-activity LLM response."""

    creativeExperientialActivities: List[CreativeExperientialActivity]


class OcrResult(BaseModel):
    """Combined result returned by :func:`ocr_process`."""

    creativeExperientialActivities: List[CreativeExperientialActivity]
    academicAchievement: List[AcademicAchievement]


# Appended to the original prompt when a response was cut off.
# Template variable: {latest-response}.
continuation_prompt = """
아래 latest-response는 이전 응답의 마지막 부분입니다.
respond from where you left off.
your response would be appended to the latest-response, so there is no need to include the latest-response in your response.
your response appended to latest-response should be a full json output format.
if there is new line space remove them.

{latest-response}
"""


def clean_json_output(output):
    """Strip surrounding whitespace and a Markdown code fence from *output*.

    Both a "```json" and a bare "```" opening fence are removed, as is a
    trailing "```".
    """
    cleaned = output.strip()
    if cleaned.startswith("```json"):
        cleaned = cleaned[len("```json"):]
    elif cleaned.startswith("```"):
        cleaned = cleaned[len("```"):]
    if cleaned.endswith("```"):
        cleaned = cleaned[:-len("```")]
    return cleaned.strip()


def _was_truncated(response) -> bool:
    """Return True when the response metadata reports a token-limit cut-off.

    Checks ``finish_reason == "length"`` and ``stop_reason == "max_tokens"``;
    a response without either key counts as complete.
    """
    metadata = getattr(response, "response_metadata", None) or {}
    return (
        metadata.get("finish_reason") == "length"
        or metadata.get("stop_reason") == "max_tokens"
    )


def get_complete_response(client, prompt, input: dict):
    """Invoke *client* with *prompt* and stitch together truncated output.

    Args:
        client: Chat model (or runnable) to pipe the prompt into.
        prompt: Human-message template text.
        input: Values for the template variables. The dict is not modified.

    Returns:
        The cleaned response text. While a response is reported as
        truncated, the model is asked to continue and the cleaned
        continuation is appended.
    """
    # Work on a copy so the caller's dict never gains "latest-response".
    variables = dict(input)
    system_message = ("system", "You are a tutor for a high school student")

    chain = ChatPromptTemplate.from_messages(
        messages=[system_message, ("human", prompt)]
    ) | client
    response = chain.invoke(variables)
    print(response)
    chunks = [clean_json_output(response.content)]

    if _was_truncated(response):
        # The continuation template is loop-invariant; build it once.
        continuation_chain = ChatPromptTemplate.from_messages(
            messages=[system_message, ("human", prompt + continuation_prompt)]
        ) | client
        while _was_truncated(response):
            variables["latest-response"] = response.content
            response = continuation_chain.invoke(variables)
            chunks.append(clean_json_output(response.content))

    return "".join(chunks)


def ocr_process(pdfFile: str):
    """Run the full pipeline on *pdfFile* and return an :class:`OcrResult`.

    Raises:
        KeyError: if the creative-activities chapter is not found.
        json.JSONDecodeError: if a model response is not valid JSON.
        pydantic.ValidationError: if a response does not match the models.
    """
    processor = DocumentProcessor(pdfFile)
    chapters, subject_achievement, course_types = processor.process()

    print("Chapters:", chapters)
    print("Subject Achievement:", subject_achievement)
    print("Course Types:", course_types)

    creative_response = get_complete_response(
        gpt_4o,
        creative_experiential_activities_prompt,
        {"creative-experiential-activities-html": chapters['5. 창의적 체험활동상황']},
    )
    print("Response: " + creative_response)

    creative_result = CreativeExperientialActivityResult.model_validate(json.loads(creative_response))
    creative_activities = creative_result.creativeExperientialActivities

    print("-----학년별 항목별 교과학습발달상황----")
    print(course_types)

    academic_results = []
    # course_types is keyed by grade first, then by course type
    # (see DocumentProcessor._process_course_types).
    for grade, courses_by_type in course_types.items():
        for course_type, course_content in courses_by_type.items():
            academic_response = get_complete_response(
                gpt_4o,
                academic_achievement_prompt,
                {
                    "grade-course-type": f"{grade}_{course_type}",
                    "academic-achievement-html": course_content,
                },
            )
            result = AcademicAchievementResult.model_validate(json.loads(academic_response))
            print("----full response of academic_response----")
            print(result)
            academic_results.extend(result.academicAchievement)

    print(academic_results)

    print("----결과 in json----")
    print(json.dumps({
        "creative-experiential-activity": [activity.model_dump() for activity in creative_activities],
        "academic-achievements": [achievement.model_dump() for achievement in academic_results],
    }, ensure_ascii=False, indent=4))

    return OcrResult(
        creativeExperientialActivities=creative_activities,
        academicAchievement=academic_results,
    )