Spaces:
Runtime error
Runtime error
| import json | |
| import os | |
| from bs4 import BeautifulSoup | |
| import re | |
| from typing import List, Dict | |
| from pydantic import BaseModel | |
| from langchain_openai import ChatOpenAI | |
| from langchain_anthropic import ChatAnthropic | |
| from langchain_core.prompts import ChatPromptTemplate | |
| from langchain_upstage import UpstageDocumentParseLoader | |
# Read the OpenAI key from the environment; a missing variable raises KeyError
# as soon as this module is imported.
openai_api_key = os.environ["OPENAI_API_KEY"]
# Module-level chat-model clients, both created with temperature=0.
gpt_4o_mini = ChatOpenAI(model="gpt-4o-mini", temperature=0, openai_api_key=openai_api_key)
gpt_4o = ChatOpenAI(model="gpt-4o", temperature=0, openai_api_key=openai_api_key)
# Prompt template for mapping the creative-experiential-activities HTML chapter
# to JSON. The only template variable is {creative-experiential-activities-html};
# doubled braces ({{ / }}) are escaped literal braces of the JSON example.
# NOTE(review): the non-ASCII text below looks mis-encoded in this copy of the
# file - verify against the original source encoding before editing it.
creative_experiential_activities_prompt = """
<instructions>
์๋ <creative-experiential-activities-html>์ <creative-experiential-activities-json>ํ์์ผ๋ก ๋งคํํด์ ์๋ต์ผ๋ก ์ ๋ฌํด์ค.
์๋ฌธ์ ๋๋ฝ์ด ์์ด์ ์๋๊ณ , ๊ฐ๋ ์คํ๊ฐ ์๊ฑฐ๋ ๋ฌธ๋งฅ์ ๋ง์ง ์์ ๋์ด์ฐ๊ธฐ๊ฐ ์๋ค๋ฉด ๋์ ํ๋จ์ ์์ ํด์ ์ ๋ฌํด์ค.
๋ค์ํ๋ฒ ๋งํ์ง๋ง ์ ๋๋ก ์๋ต์ ๊ฐ์์ด๋ ๋๋ฝ์ด ์์ผ๋ฉด ์๋ผ.
areaOfInterest๋ ํฌ๋ง๋ถ์ผ๋ฅผ ์๋ฏธํ๋ฉฐ ์ง๋กํ๋์๋ง ๋ค์ด๊ฐ.
areaOfInterest๋ ๊ทธ๋ฆฌ๊ณ ํฌ๋ง๋ถ์ผ๋ผ๋ ๋จ์ด๋ฅผ ๋บ ๋ด์ฉ๋ค๋ง ๋ค์ด๊ฐ๋ฉด ๋ผ
areaOfInterest๊ฐ ์๋ ๊ฒฝ์ฐ์ ๋น ๋ฌธ์์ด๋ก ์ฒ๋ฆฌํด์ค.
output์ json๋ง ์ ๋ฌํด์ค.
</instructions>
<creative-experiential-activities-html>
{creative-experiential-activities-html}
</creative-experiential-activities-html>
<creative-experiential-activities-json>
{{
//์ฐฝ์์ ์ฒดํํ๋์ฌํญ
"creativeExperientialActivities": [
{{
//์์ญ
"activityType": "AUTONOMOUS", // AUTONOMOUS, CLUB, CAREER
//ํ๋ 
"grade": "1", (integer)
//์๊ฐ
"hours": "", (integer)
//ํน๊ธฐ์ฌํญ
"specialNotes": "", (string)
//ํฌ๋ง๋ถ์ผ
"areaOfInterest": "" (string)
}},
{{
"activityType": "CLUB", // AUTONOMOUS, CLUB, CAREER
"grade": "1", //ํ๋ 
"hours": "",
"specialNotes": "",
"areaOfInterest": ""
}},
{{
"activityType": "CAREER", // AUTONOMOUS, CLUB, CAREER
"grade": "1", //ํ๋ 
"hours": "",
"specialNotes": "",
"areaOfInterest": ""
}},
{{
"activityType": "AUTONOMOUS", // AUTONOMOUS, CLUB, CAREER
"grade": "2", //ํ๋ 
"hours": "",
"specialNotes": "",
"areaOfInterest": ""
}},
{{
"activityType": "CLUB", // AUTONOMOUS, CLUB, CAREER
"grade": "2", //ํ๋ 
"hours": "",
"specialNotes": "",
"areaOfInterest": ""
}},
{{
"activityType": "CAREER", // AUTONOMOUS, CLUB, CAREER
"grade": "2", //ํ๋ 
"hours": "",
"specialNotes": "",
"areaOfInterest": ""
}},
{{
"activityType": "AUTONOMOUS", // AUTONOMOUS, CLUB, CAREER
"grade": 3, //ํ๋ 
"hours": "",
"specialNotes": "",
"areaOfInterest": ""
}},
{{
"activityType": "CLUB", // AUTONOMOUS, CLUB, CAREER
"grade": "3", //ํ๋ 
"hours": "",
"specialNotes": "",
"areaOfInterest": ""
}},
{{
"activityType": "CAREER", // AUTONOMOUS, CLUB, CAREER
"grade": "3", //ํ๋ 
"hours": "",
"specialNotes": "",
"areaOfInterest": ""
}},
],
}}
</creative-experiential-activities-json>
"""
# Prompt template for mapping one grade/course-type slice of the
# academic-achievement HTML to JSON. Template variables: {grade-course-type} and
# {academic-achievement-html}; doubled braces ({{ / }}) are escaped literal braces.
# NOTE(review): <instructions> is opened twice below but closed once - possibly
# an accidental duplicate; confirm before changing the prompt text.
# NOTE(review): the non-ASCII text below looks mis-encoded in this copy of the
# file - verify against the original source encoding before editing it.
academic_achievement_prompt = """
<instructions>
<instructions>
์๋ <academic-achievement-html>์ <academic-achievement-json>ํ์์ผ๋ก ๋งคํํด์ ์๋ต์ผ๋ก ์ ๋ฌํด์ค.
ํ๋ ๊ณผ course type์๋ <grade-course-type>์ ๋์์์ ์์ ์ด์ผ.
academic-achievement-html์ ์ธ๋ถ ๋ฅ๋ ฅ ๋ฐ ํน๊ธฐ์ฌํญ์ ๊ต๊ณผ, ๊ณผ๋ชฉ์ ๋ํ ํ์์ ์์ธ ์ฑ์ทจ ๋ฐ ํน๊ธฐ์ฌํญ์ด ๋ค์ด๊ฐ๊ณ ์์ด.
์ธ๋ถ๋ฅ๋ ฅ ๋ฐ ํน๊ธฐ์ฌํญ ์๋์ ๋ด์ฉ์ ์๋์ฒ๋ผ ๊ตฌ์ฑ๋์ด ์์ผ๋๊น ์ ์ฐธ๊ณ ํด์ ๊ต๊ณผ๋ชฉ๋ณ๋ก ์ ํ ๋นํด์ ๋งคํํด์ค
์ธ๋ถ๋ฅ๋ ฅ ๋ฐ ํน๊ธฐ์ฌํญ์ ์์ฃผ ๊ผผ๊ผผํ ํ์ธํด์ ์๋ตํด์ค.
- ๊ต๊ณผ๋ชฉ : ํน๊ธฐ์ฌํญ ๋ฐ ์ฑ์ทจ๋
- ๊ต๊ณผ๋ชฉ์ ํ๊ธฐ๊ฐ ๋๋ ์ ธ์ ์์ฑ๋์ด ์๋ ๊ฒฝ์ฐ์ ๊ทธ์ ๋ง๊ฒ ๋งคํํด์ฃผ๋ฉด ๋ผ.
- ๋ง์ฝ ๊ต๊ณผ๋ชฉ์ ํ๊ธฐ๊ฐ ๋ช ํํ ๋๋ ์ ธ์๊ฑฐ๋ ๋ช ์๋์ด ์์ง ์๊ณ , 1ํ๊ธฐ 2ํ๊ธฐ ๋ชจ๋์ ํด๋น ๊ต๊ณผ๋ชฉ์ด ์๋ ๊ฒฝ์ฐ์ 1ํ๊ธฐ์๋ง ๋งคํํด์ฃผ๊ณ 2ํ๊ธฐ๋ ๋น์นธ์ผ๋ก ์ ๋ฌํด์ค.
์๋ฌธ์ ๋๋ฝ์ด ์์ด์ ์๋๊ณ , ๊ฐ๋ ์คํ๊ฐ ์๊ฑฐ๋ ๋ฌธ๋งฅ์ ๋ง์ง ์์ ๋์ด์ฐ๊ธฐ๊ฐ ์๋ค๋ฉด ๋์ ํ๋จ์ ์์ ํด์ ์ ๋ฌํด์ค.
๋ค์ํ๋ฒ ๋งํ์ง๋ง ์ ๋๋ก ์๋ต์ ๊ฐ์์ด๋ ๋๋ฝ์ด ์์ผ๋ฉด ์๋ผ.
ํ์ธํด๋ณด๊ณ ์ ๋ฌํ ์๋ต๊ฐ์ด ์๋ค๋ฉด ๋น ์๋ต๊ฐ์ ์ค๋ ๊ด์ฐฎ์.
output์ json๋ง ์ ๋ฌํด์ค.
</instructions>
<grade-course-type>
{grade-course-type}
</grade-course-type>
<academic-achievement-html>
{academic-achievement-html}
</academic-achievement-html>
<academic-achievement-json>
{{
//๊ต๊ณผํ์ต๋ฐ๋ฌ์ํฉ
"academicAchievement": [
{{
// ๊ธฐ๋ณธ, ์ง๋ก์ ํ๊ณผ๋ชฉ, ์ฒด์ก, ์์ ๋ฑ ๊ต๊ณผ์ type
"courseType": "BASIC", //BASIC(๊ธฐ๋ณธ), CAREER(์ง๋ก ์ ํ ๊ณผ๋ชฉ), PE_ARTS(์ฒด์ก, ์์ )
"grade": 1, //ํ๋ 
"semester": 1, //ํ๊ธฐ either 1 or 2
"subject": "๊ตญ์ด", // ๊ต๊ณผ
"course": "๊ตญ์ด", // ๊ณผ๋ชฉ
//์ธ๋ถ๋ฅ๋ ฅ ๋ฐ ํน๊ธฐ์ฌํญ ํ์๋์ ๊ฐ ๊ณผ๋ชฉ ์๋ ๋ด์ฉ
"detailedAbilities": "" //์ธ๋ถ๋ฅ๋ ฅ๋ฐํน๊ธฐ์ฌํญ ๊ณผ๋ชฉ๋ณ ๋ถ๋ฆฌ
}},
{{
"courseType": "BASIC", //BASIC(๊ธฐ๋ณธ), CAREER(์ง๋ก ์ ํ ๊ณผ๋ชฉ), PE_ARTS(์ฒด์ก, ์์ )
"grade": 1, //ํ๋ 
"semester": 1, //ํ๊ธฐ
"subject": "์ํ", // ๊ต๊ณผ
"course": "์ํ", // ๊ณผ๋ชฉ
"detailedAbilities": "" //์ธ๋ถ๋ฅ๋ ฅ๋ฐํน๊ธฐ์ฌํญ ๊ณผ๋ชฉ๋ณ ๋ถ๋ฆฌ
}}, ...
{{
"courseType": "BASIC", //BASIC(๊ธฐ๋ณธ), CAREER(์ง๋ก ์ ํ ๊ณผ๋ชฉ), PE_ARTS(์ฒด์ก, ์์ )
"grade": 1, //ํ๋ 
"semester": 2, //ํ๊ธฐ
"subject": "๊ตญ์ด", // ๊ต๊ณผ
"course": "์ํ", // ๊ณผ๋ชฉ
"detailedAbilities": "" //์ธ๋ถ๋ฅ๋ ฅ๋ฐํน๊ธฐ์ฌํญ ๊ณผ๋ชฉ๋ณ ๋ถ๋ฆฌ
}},
],
}}
</academic-achievement-json>
"""
class HTMLParser:
    """Namespace for turning an HTML string into a BeautifulSoup tree."""

    @staticmethod
    def parse(html_content: str) -> BeautifulSoup:
        """Parse *html_content* with Python's built-in ``html.parser`` backend.

        Declared as a static method because it takes no ``self``; without the
        decorator it could only be called on the class, never on an instance.

        Args:
            html_content: Raw HTML markup.

        Returns:
            The parsed ``BeautifulSoup`` document.
        """
        return BeautifulSoup(html_content, 'html.parser')
class ChapterExtractor:
    """Slice a parsed document into chapters keyed by their heading text.

    A chapter heading is a ``<p data-category="paragraph">`` element whose
    stripped inner markup equals one of ``desired_keys``.
    """

    def __init__(self, soup: BeautifulSoup, desired_keys: List[str]):
        """Store the parsed document and the heading texts to look for."""
        self.soup = soup
        self.desired_keys = desired_keys

    def extract(self) -> Dict[str, str]:
        """Return ``{heading: html}`` for every desired heading found.

        The value is the concatenated markup of the siblings that follow the
        heading, up to (not including) the next desired heading. An empty
        ``desired_keys`` list yields an empty dict. If a heading occurs more
        than once, the last occurrence wins.
        """
        chapters = {}
        for tag in self.soup.find_all('p', {'data-category': 'paragraph'}):
            heading = tag.decode_contents().strip()
            if heading in self.desired_keys:
                chapters[heading] = self._extract_value(tag).strip()
        return chapters

    def _extract_value(self, tag) -> str:
        """Concatenate the markup of *tag*'s following siblings until the next chapter heading."""
        parts = []
        sibling = tag.find_next_sibling()
        while sibling and not self._is_chapter_heading(sibling):
            parts.append(str(sibling))
            sibling = sibling.find_next_sibling()
        return "".join(parts)

    def _is_chapter_heading(self, tag) -> bool:
        """Tell whether *tag* is a paragraph whose text is one of the desired headings."""
        return (tag.name == 'p'
                and tag.get('data-category') == 'paragraph'
                and tag.decode_contents().strip() in self.desired_keys)
class SubjectAchievementExtractor:
    """Split one chapter's HTML into per-grade chunks keyed by the grade heading."""

    def __init__(self, chapter_content: str):
        """Parse the chapter markup once so ``extract`` can walk it."""
        self.soup = HTMLParser.parse(chapter_content)

    def extract(self) -> Dict[str, str]:
        """Return ``{grade_heading: html}`` for every grade heading paragraph.

        A grade heading is a ``<p data-category="paragraph">`` whose stripped
        inner markup matches the bracketed-number pattern below.
        """
        # NOTE(review): the non-ASCII part of this pattern looks mis-encoded in
        # this copy of the file - verify against the original source encoding.
        grade_heading = r'^\[(\d+)ํ๋ \]$'
        per_grade = {}
        for paragraph in self.soup.find_all('p', {'data-category': 'paragraph'}):
            heading = paragraph.decode_contents().strip()
            if not re.match(grade_heading, heading):
                continue
            chunk = self._extract_value(paragraph, grade_heading)
            per_grade[heading] = chunk.strip()
        return per_grade

    def _extract_value(self, tag, pattern) -> str:
        """Return the markup of *tag* plus its following siblings up to the next heading matching *pattern*."""
        # Unlike a plain "content after the heading" slice, the heading tag
        # itself is kept as the first piece.
        pieces = [str(tag)]
        sibling = tag.find_next_sibling()
        while sibling:
            starts_next_grade = (
                sibling.name == 'p'
                and sibling.get('data-category') == 'paragraph'
                and re.match(pattern, sibling.decode_contents().strip())
            )
            if starts_next_grade:
                break
            pieces.append(str(sibling))
            sibling = sibling.find_next_sibling()
        return "".join(pieces)
class CourseTypeExtractor:
    """Split one grade's HTML into BASIC / CAREER / PE_ARTS course sections.

    A section heading is any element whose inner markup contains both ``<`` and
    ``>`` plus every keyword of that section.
    """

    # Keywords that must ALL occur in a heading for it to open that section.
    # NOTE(review): these non-ASCII literals look mis-encoded in this copy of
    # the file - verify against the original source encoding.
    _CAREER_KEYWORDS = ("์ง๋ก", "์ ํ", "๊ณผ๋ชฉ")
    _PE_ARTS_KEYWORDS = ("์ฒด์ก", "์์ ")

    def __init__(self, grade_content: str):
        """Parse the grade's markup once so ``extract`` can walk it."""
        self.grade_soup = HTMLParser.parse(grade_content)

    def extract(self) -> Dict[str, str]:
        """Return ``{course_type: html}`` for the sections found in this grade.

        ``BASIC`` is everything after the first element up to the first section
        heading. ``CAREER`` and ``PE_ARTS`` are present only when their heading
        is found. Empty input yields an empty dict.
        """
        course_types = {}
        is_first = True
        for tag in self.grade_soup.find_all():
            if is_first:
                # The first element anchors the BASIC section.
                course_types["BASIC"] = self._extract_basic_course(tag).strip()
                is_first = False
            if self._is_career_heading(tag):
                course_types["CAREER"] = self._extract_career_course(tag).strip()
            if self._is_pe_arts_heading(tag):
                course_types["PE_ARTS"] = self._extract_pe_arts_course(tag).strip()
        return course_types

    @staticmethod
    def _is_heading(tag, keywords) -> bool:
        """Tell whether *tag*'s inner markup has angle brackets and every keyword."""
        # NOTE(review): decode_contents() returns serialized markup; confirm
        # that literal angle brackets survive in real loader output.
        text = tag.decode_contents().strip()
        return "<" in text and ">" in text and all(word in text for word in keywords)

    def _is_career_heading(self, tag) -> bool:
        """Tell whether *tag* opens the career-elective section."""
        return self._is_heading(tag, self._CAREER_KEYWORDS)

    def _is_pe_arts_heading(self, tag) -> bool:
        """Tell whether *tag* opens the PE/arts section."""
        return self._is_heading(tag, self._PE_ARTS_KEYWORDS)

    @staticmethod
    def _collect_siblings(tag, is_stop) -> str:
        """Concatenate the markup of *tag*'s following siblings until ``is_stop(sibling)`` is true."""
        parts = []
        sibling = tag.find_next_sibling()
        while sibling and not is_stop(sibling):
            parts.append(str(sibling))
            sibling = sibling.find_next_sibling()
        return "".join(parts)

    def _extract_basic_course(self, tag) -> str:
        """Return the markup after *tag* up to the first section heading.

        Stops at the PE/arts heading as well as the career heading, so a grade
        without a career section does not absorb the PE/arts section.
        """
        return self._collect_siblings(
            tag,
            lambda sibling: self._is_career_heading(sibling) or self._is_pe_arts_heading(sibling),
        )

    def _extract_career_course(self, tag) -> str:
        """Return the markup after the career heading up to the PE/arts heading."""
        return self._collect_siblings(tag, self._is_pe_arts_heading)

    def _extract_pe_arts_course(self, tag) -> str:
        """Return all markup after the PE/arts heading."""
        return self._collect_siblings(tag, lambda sibling: False)
class DocumentProcessor:
    """Load a document as HTML and break it into chapters, grades and course types."""

    def __init__(self, file_path: str):
        """Create the document loader and the list of chapter headings to extract."""
        self.file_path = file_path
        self.loader = UpstageDocumentParseLoader(file_path, ocr="force", output_format="html")
        # NOTE(review): these heading literals look mis-encoded in this copy of
        # the file - verify against the original source encoding.
        self.desired_keys = [
            '4. ์๊ฒฉ์ฆ ๋ฐ ์ธ์ฆ ์ทจ๋์ํฉ',
            '5. ์ฐฝ์์ ์ฒดํํ๋์ํฉ',
            '6. ๊ต๊ณผํ์ต๋ฐ๋ฌ์ํฉ',
            '7. ๋ ์ํ๋์ํฉ',
            '8. ํ๋ํน์ฑ ๋ฐ ์ข ํฉ์๊ฒฌ'
        ]

    def process(self):
        """Run the pipeline and return ``(chapters, subject_achievement, course_types)``."""
        document_soup = HTMLParser.parse(self._load_document())
        chapters = ChapterExtractor(document_soup, self.desired_keys).extract()
        subject_achievement = self._process_subject_achievement(chapters)
        course_types = self._process_course_types(subject_achievement)
        return chapters, subject_achievement, course_types

    def _load_document(self) -> str:
        """Load every page through the loader and concatenate their contents."""
        page_contents = [page.page_content for page in self.loader.load()]
        return "".join(page_contents)

    def _process_subject_achievement(self, chapters: Dict[str, str]) -> Dict[str, str]:
        """Split chapter 6 into per-grade HTML; return ``{}`` when that chapter is absent."""
        if '6. ๊ต๊ณผํ์ต๋ฐ๋ฌ์ํฉ' not in chapters:
            return {}
        return SubjectAchievementExtractor(chapters['6. ๊ต๊ณผํ์ต๋ฐ๋ฌ์ํฉ']).extract()

    def _process_course_types(self, subject_achievement: Dict[str, str]) -> Dict[str, Dict[str, str]]:
        """Map each grade heading to its ``{course_type: html}`` sections."""
        return {
            grade: CourseTypeExtractor(content).extract()
            for grade, content in subject_achievement.items()
        }
class CreativeExperientialActivity(BaseModel):
    """One entry of the ``creativeExperientialActivities`` list in the model's JSON output."""
    # The prompt template in this file lists AUTONOMOUS, CLUB and CAREER as values.
    activityType: str
    # Marked "(integer)" in the prompt template.
    grade: int
    # Marked "(integer)" in the prompt template.
    # NOTE(review): the template's example shows "" for hours - confirm that
    # value validates as int.
    hours: int
    specialNotes: str
    # The prompt template's example uses an empty string as placeholder.
    areaOfInterest: str
class AcademicAchievement(BaseModel):
    """One entry of the ``academicAchievement`` list in the model's JSON output."""
    # The prompt template in this file lists BASIC, CAREER and PE_ARTS as values.
    courseType: str
    grade: int
    # The prompt template says "either 1 or 2".
    semester: int
    subject: str
    course: str
    detailedAbilities: str
class AcademicAchievementResult(BaseModel):
    """Top-level shape of the academic-achievement JSON response."""
    # Entries parsed from the "academicAchievement" key of the response.
    academicAchievement: List[AcademicAchievement]
class CreativeExperientialActivityResult(BaseModel):
    """Top-level shape of the creative-experiential-activities JSON response."""
    # Entries parsed from the "creativeExperientialActivities" key of the response.
    creativeExperientialActivities: List[CreativeExperientialActivity]
class OcrResult(BaseModel):
    """Combined result returned by ``ocr_process``: both activity and achievement lists."""
    creativeExperientialActivities: List[CreativeExperientialActivity]
    academicAchievement: List[AcademicAchievement]
# Text appended to the original prompt when a response was cut off; asks the
# model to continue from the end of the previous response. The only template
# variable is {latest-response}.
# NOTE(review): the first line of the text looks mis-encoded in this copy of
# the file - verify against the original source encoding.
continuation_prompt = """
์๋ latest-response๋ ์ด์ ์๋ต์ ๋ง์ง๋ง ๋ถ๋ถ์ ๋๋ค.
respond from where you left off.
your response would be appended to the latest-response, so there is no need to include the latest-response in your response.
your response appended to latest-response should be a full json output format.
if there is new line space remove them.
<latest-response>
{latest-response}
</latest-response>
"""
def clean_json_output(output):
    """Strip surrounding whitespace and Markdown code fences from a model reply.

    Handles an opening fence with or without a ``json`` language tag (any
    letter case) and a closing fence. Either fence may be missing, which
    happens when a reply is a truncated chunk or a continuation chunk.

    Args:
        output: Raw text returned by the model.

    Returns:
        The text with the fences and outer whitespace removed.
    """
    output = output.strip()
    if output.startswith("```"):
        output = output[3:]
        # Drop the optional language tag that follows the opening fence.
        if output[:4].lower() == "json":
            output = output[4:]
    if output.endswith("```"):
        output = output[:-3]
    return output.strip()
def _stopped_at_token_limit(response) -> bool:
    """Tell whether *response* reports that it was cut off by the output-token limit.

    Checks both metadata spellings that appear in this file:
    ``finish_reason == "length"`` and ``stop_reason == "max_tokens"``.
    Missing metadata counts as "not truncated".
    """
    metadata = getattr(response, "response_metadata", None) or {}
    return (metadata.get("finish_reason") == "length"
            or metadata.get("stop_reason") == "max_tokens")


def get_complete_response(client, prompt, input: dict):
    """Invoke *client* with *prompt* and keep asking for continuations until the reply is complete.

    Each reply chunk is passed through ``clean_json_output`` and the chunks are
    concatenated. While a reply reports truncation, the prompt is re-sent with
    ``continuation_prompt`` appended and the previous chunk supplied as the
    ``latest-response`` template variable.

    Args:
        client: Chat model runnable that can be piped after a prompt template.
        prompt: Human-message template text.
        input: Template variables for *prompt*. The dict is not modified.

    Returns:
        The concatenated, fence-stripped response text.
    """
    system_message = ("system", "You are a tutor for a high school student")
    # Work on a copy so the caller's dict is not polluted with "latest-response".
    variables = dict(input)

    chain = ChatPromptTemplate.from_messages(
        messages=[system_message, ("human", prompt)]
    ) | client
    response = chain.invoke(variables)
    print(response)
    full_response = clean_json_output(response.content)

    continuation_chain = None
    while _stopped_at_token_limit(response):
        if continuation_chain is None:
            # Built once: the continuation template is the same on every round.
            continuation_chain = ChatPromptTemplate.from_messages(
                messages=[system_message, ("human", prompt + continuation_prompt)]
            ) | client
        variables["latest-response"] = response.content
        response = continuation_chain.invoke(variables)
        full_response += clean_json_output(response.content)
    return full_response
def ocr_process(pdfFile: str):
    """Run the full pipeline on one document and return the structured result.

    Steps: parse the document into chapters, per-grade and per-course-type
    HTML; ask the chat model to convert the creative-activities chapter and
    each grade/course-type section to JSON; validate the JSON into models.

    Args:
        pdfFile: Path of the document handed to ``DocumentProcessor``.

    Returns:
        An ``OcrResult`` holding the creative activities and academic achievements.

    Raises:
        KeyError: If the creative-activities chapter was not found.
        Validation or JSON decoding errors propagate when a model reply cannot
        be parsed into the expected shape.
    """
    processor = DocumentProcessor(pdfFile)
    chapters, subject_achievement, course_types = processor.process()
    print("Chapters:", chapters)
    print("Subject Achievement:", subject_achievement)
    print("Course Types:", course_types)

    # NOTE(review): the non-ASCII literals in this function look mis-encoded in
    # this copy of the file - verify against the original source encoding.
    creative_response = get_complete_response(gpt_4o, creative_experiential_activities_prompt, {
        "creative-experiential-activities-html": chapters['5. ์ฐฝ์์ ์ฒดํํ๋์ํฉ'],
    })
    print("Response: " + creative_response)
    creative_result = CreativeExperientialActivityResult.model_validate(json.loads(creative_response))
    creative_activities = creative_result.creativeExperientialActivities

    print("-----ํ๋ ๋ณ ํญ๋ชฉ๋ณ ๊ต๊ณผํ์ต๋ฐ๋ฌ์ํฉ----")
    print(course_types)
    academic_results = []
    # course_types is keyed by grade heading first, then by course type.
    for grade, sections in course_types.items():
        for course_type, section_html in sections.items():
            academic_response = get_complete_response(gpt_4o, academic_achievement_prompt, {
                "grade-course-type": f"{grade}_{course_type}",
                "academic-achievement-html": section_html,
            })
            parsed = AcademicAchievementResult.model_validate(json.loads(academic_response))
            print("----full response of academic_response----")
            print(parsed)
            academic_results.extend(parsed.academicAchievement)
    print(academic_results)

    print("----๊ฒฐ๊ณผ in json----")
    print(json.dumps({
        "creative-experiential-activity": [activity.model_dump() for activity in creative_activities],
        "academic-achievements": [achievement.model_dump() for achievement in academic_results]
    }, ensure_ascii=False, indent=4))
    return OcrResult(creativeExperientialActivities=creative_activities, academicAchievement=academic_results)