Spaces:

Heartsync
/

cobiz

Running

App Files Community

seawolf2357 committed on 18 days ago

Commit

cb1f35e

verified ·

1 Parent(s): 608dd69

Update file_api.py

Browse files

Files changed (1) hide show

file_api.py +67 -11

file_api.py CHANGED Viewed

@@ -441,9 +441,20 @@ def call_groq_api_stream(messages: List[Dict]) -> Generator[str, None, None]:
         yield f"❌ API 오류: {str(e)}"
-def fetch_announcement_detail(url: str) -> Tuple[str, List[Dict]]:
-    """공고 상세 페이지에서 본문과 첨부파일 정보 추출"""
     try:
         headers = {
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
@@ -453,6 +464,8 @@ def fetch_announcement_detail(url: str) -> Tuple[str, List[Dict]]:
         response.raise_for_status()
         html_text = response.text
         soup = BeautifulSoup(html_text, 'html.parser')
         content_text = ""
         tables = soup.find_all('table')
         for table in tables:
@@ -462,12 +475,19 @@ def fetch_announcement_detail(url: str) -> Tuple[str, List[Dict]]:
         main_content = soup.find('div', {'id': 'container'}) or soup.find('main') or soup.find('article')
         if main_content and not content_text:
             content_text = main_content.get_text(separator='\n', strip=True)
-        attachments = []
         for a_tag in soup.find_all('a', href=True):
             href = a_tag.get('href', '')
             href_clean = re.sub(r';jsessionid=[^?]*', '', href)
             if 'getImageFile.do' in href_clean or 'fileDown' in href_clean or 'atchFileId' in href_clean:
                 filename = a_tag.get_text(strip=True)
                 if filename in ['다운로드', '바로보기', '내려받기', '']:
                     parent = a_tag.parent
                     if parent:
@@ -482,24 +502,60 @@ def fetch_announcement_detail(url: str) -> Tuple[str, List[Dict]]:
                         match = re.search(r'첨부파일\s+(.+?)\s+다운로드', title)
                         if match:
                             filename = match.group(1)
                 if not filename or filename in ['다운로드', '바로보기', '내려받기']:
                     filename = f"첨부파일_{len(attachments)+1}"
                 if href_clean.startswith('/'):
                     full_url = f"https://www.bizinfo.go.kr{href_clean}"
                 elif href_clean.startswith('http'):
                     full_url = href_clean
                 else:
                     continue
                 ext = Path(filename).suffix.lower()
                 if not ext:
                     ext = '.unknown'
-                if not any(att['url'] == full_url for att in attachments):
-                    attachments.append({
-                        "filename": filename,
-                        "url": full_url,
-                        "type": ext[1:] if ext.startswith('.') else ext
-                    })
-        return content_text, attachments
     except Exception as e:
         import traceback
-        return f"상세 정보 조회 실패: {str(e)}\n{traceback.format_exc()}", []

         yield f"❌ API 오류: {str(e)}"
+def fetch_announcement_detail(url: str) -> Tuple[str, List[Dict], Optional[Dict]]:
+    """공고 상세 페이지에서 본문, 첨부파일, 본문출력파일 정보 추출
+    Returns:
+        (content_text, attachments, print_file)
+        - content_text: 공고 본문 텍스트
+        - attachments: 일반 첨부파일 리스트 (서식, 양식 등)
+        - print_file: 본문출력파일 (공고문 PDF/HWP) - AI 분석용
+    """
     try:
+        # URL 정규화
+        if url.startswith('/'):
+            url = f"https://www.bizinfo.go.kr{url}"
         headers = {
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
         response.raise_for_status()
         html_text = response.text
         soup = BeautifulSoup(html_text, 'html.parser')
+        # 본문 텍스트 추출
         content_text = ""
         tables = soup.find_all('table')
         for table in tables:
         main_content = soup.find('div', {'id': 'container'}) or soup.find('main') or soup.find('article')
         if main_content and not content_text:
             content_text = main_content.get_text(separator='\n', strip=True)
+        attachments = []  # 일반 첨부파일
+        print_file = None  # 본문출력파일
+        # 파일 링크 추출
         for a_tag in soup.find_all('a', href=True):
             href = a_tag.get('href', '')
             href_clean = re.sub(r';jsessionid=[^?]*', '', href)
             if 'getImageFile.do' in href_clean or 'fileDown' in href_clean or 'atchFileId' in href_clean:
                 filename = a_tag.get_text(strip=True)
+                # 파일명 추출 개선
                 if filename in ['다운로드', '바로보기', '내려받기', '']:
                     parent = a_tag.parent
                     if parent:
                         match = re.search(r'첨부파일\s+(.+?)\s+다운로드', title)
                         if match:
                             filename = match.group(1)
                 if not filename or filename in ['다운로드', '바로보기', '내려받기']:
                     filename = f"첨부파일_{len(attachments)+1}"
+                # URL 정규화
                 if href_clean.startswith('/'):
                     full_url = f"https://www.bizinfo.go.kr{href_clean}"
                 elif href_clean.startswith('http'):
                     full_url = href_clean
                 else:
                     continue
                 ext = Path(filename).suffix.lower()
                 if not ext:
                     ext = '.unknown'
+                file_info = {
+                    "filename": filename,
+                    "url": full_url,
+                    "type": ext[1:] if ext.startswith('.') else ext
+                }
+                # ⭐ 본문출력파일 판별 (공고문 PDF/HWP)
+                # 패턴: "(제XXXX-XX호)", "공고", "공고문", "본문", 날짜 패턴
+                is_print_file = False
+                filename_lower = filename.lower()
+                # 1. "(제XXXX-XX호)" 패턴 - 공고번호 포함
+                if re.search(r'\(제\d+[-_]?\d*호\)', filename):
+                    is_print_file = True
+                # 2. "공고", "공고문", "모집공고" 등 키워드
+                elif any(kw in filename for kw in ['공고문', '모집공고', '공고(안)', '공고 안', '_공고_', '_공고.']):
+                    is_print_file = True
+                # 3. "본문출력" 키워드
+                elif '본문출력' in filename or '본문 출력' in filename:
+                    is_print_file = True
+                # 4. 부모 요소에 "본문출력파일" 텍스트가 있는지 확인
+                parent = a_tag.parent
+                grandparent = parent.parent if parent else None
+                for ancestor in [parent, grandparent]:
+                    if ancestor:
+                        ancestor_text = ancestor.get_text(strip=True)
+                        if '본문출력파일' in ancestor_text or '본문출력 파일' in ancestor_text:
+                            is_print_file = True
+                            break
+                if is_print_file and not print_file:
+                    print_file = file_info
+                elif not any(att['url'] == full_url for att in attachments):
+                    # "(첨부" 로 시작하는 파일은 일반 첨부파일
+                    attachments.append(file_info)
+        return content_text, attachments, print_file
     except Exception as e:
         import traceback
+        return f"상세 정보 조회 실패: {str(e)}\n{traceback.format_exc()}", [], None