Spaces:
Sleeping
Sleeping
| from pptx import Presentation | |
| from docx import Document | |
| import pdfplumber | |
| from reportlab.lib.pagesizes import letter | |
| from pathlib import Path | |
| import re | |
| import pandas as pd | |
| import pdfplumber | |
| from concurrent.futures import ThreadPoolExecutor | |
| from reportlab.platypus import SimpleDocTemplate, Paragraph | |
| from reportlab.lib.styles import getSampleStyleSheet | |
def extract_text(file_path: Path, file_type: str) -> str:
    """Extract plain text from a document of the given type.

    Args:
        file_path: Location of the file to read.
        file_type: One of ``"txt"``, ``"docx"``, ``"xlsx"``, ``"pptx"`` or
            ``"pdf"`` (compared exactly, lower-case).

    Returns:
        The extracted text. An unrecognised ``file_type`` yields ``""``.
    """
    if file_type == "txt":
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()

    if file_type == "docx":
        doc = Document(file_path)
        # Skip paragraphs that are empty or whitespace-only.
        return "\n".join(
            para.text for para in doc.paragraphs if para.text.strip()
        )

    if file_type == "xlsx":
        # NOTE(review): read_excel is called with its defaults, so (per the
        # pandas docs) only the first sheet is read and its first row is
        # taken as the header, which header=False then leaves out of the
        # text. Confirm this is intended.
        df = pd.read_excel(file_path, engine="openpyxl")
        if df.empty:
            # to_string() on an empty frame returns the "Empty DataFrame"
            # repr, which must not be reported as document content.
            return ""
        return df.to_string(index=False, header=False).strip()

    if file_type == "pptx":
        prs = Presentation(file_path)
        text_parts = []
        for slide in prs.slides:
            for shape in slide.shapes:
                if shape.has_text_frame:
                    frame_text = shape.text_frame.text.strip()
                    # Empty placeholders would otherwise add blank lines;
                    # table cells below are filtered the same way.
                    if frame_text:
                        text_parts.append(frame_text)
                elif shape.has_table:
                    for row in shape.table.rows:
                        for cell in row.cells:
                            cell_text = cell.text.strip()
                            if cell_text:
                                text_parts.append(cell_text)
        return "\n".join(text_parts)

    if file_type == "pdf":
        with pdfplumber.open(file_path) as pdf:
            # Pages are extracted one after another: they all belong to the
            # same open document, and the extraction is CPU-bound, so a
            # thread pool adds shared-state risk without a speed-up.
            page_texts = [
                page.extract_text_simple() or "" for page in pdf.pages
            ]
        return "\n".join(part for part in page_texts if part).strip()

    return ""
def save_file(text: str, file_type: str, output_path: Path) -> None:
    """Write ``text`` to ``output_path`` in the requested document format.

    Args:
        text: Content to store; lines are separated by ``"\\n"``.
        file_type: ``"docx"``, ``"xlsx"``, ``"pptx"`` or ``"pdf"``. Any other
            value writes a plain UTF-8 text file.
        output_path: Destination file.

    Per-format layout:
        * docx: the whole text is added as a single paragraph.
        * xlsx: one row per line in a single ``"Content"`` column.
        * pptx: one slide per 25 lines, written into placeholder 1 of
          slide layout 1.
        * pdf: one paragraph per non-blank line on letter-sized pages.
    """
    if file_type == "docx":
        doc = Document()
        doc.add_paragraph(text)
        doc.save(output_path)
    elif file_type == "xlsx":
        df = pd.DataFrame(text.split("\n"), columns=["Content"])
        df.to_excel(output_path, index=False, engine="xlsxwriter")
    elif file_type == "pptx":
        lines_per_slide = 25
        prs = Presentation()
        slide_layout = prs.slide_layouts[1]
        text_lines = text.split("\n")
        for start in range(0, len(text_lines), lines_per_slide):
            chunk = text_lines[start:start + lines_per_slide]
            slide = prs.slides.add_slide(slide_layout)
            text_frame = slide.shapes.placeholders[1].text_frame
            text_frame.clear()
            text_frame.text = "\n".join(chunk)
        prs.save(output_path)
    elif file_type == "pdf":
        # Imported here so the block does not depend on a file-level import.
        from xml.sax.saxutils import escape

        doc = SimpleDocTemplate(str(output_path), pagesize=letter)
        normal_style = getSampleStyleSheet()["Normal"]
        # Paragraph parses its text as XML-like markup, so literal "<", ">"
        # and "&" in the content must be escaped before rendering.
        flowables = [
            Paragraph(escape(line), normal_style)
            for line in text.split("\n")
            if line.strip()
        ]
        doc.build(flowables)
    else:
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(text)
def verify_summary(summary: str, original: str) -> str:
    """Keep only the summary sentences whose wording is backed by ``original``.

    The summary is split on ``"."``. Within each sentence, words of at least
    three characters count as significant. A sentence is kept when at least
    30% of its significant words occur as whole words in ``original``
    (case-insensitive, punctuation ignored).

    Args:
        summary: Generated summary text to check.
        original: Source text the summary was produced from.

    Returns:
        The kept sentences joined with ``". "`` and ending in terminal
        punctuation. If no sentence qualifies, the first 500 characters of
        ``summary`` are returned unchanged.
    """
    min_word_len = 3
    min_match_ratio = 0.3

    # A set of whole words: avoids substring hits such as "cat" matching
    # "concatenate", and makes each lookup O(1).
    original_words = set(re.findall(r"\w+", original.lower()))

    verified = []
    for fragment in summary.split("."):
        sentence = fragment.strip()
        if not sentence:
            continue
        # Tokenising with \w+ drops attached punctuation ("grew," -> "grew").
        significant = [
            word
            for word in re.findall(r"\w+", sentence.lower())
            if len(word) >= min_word_len
        ]
        if not significant:
            continue
        matches = sum(word in original_words for word in significant)
        if matches / len(significant) >= min_match_ratio:
            verified.append(sentence)

    if not verified:
        return summary[:500]

    result = ". ".join(verified)
    # Splitting on "." removed the final period; put it back unless the last
    # sentence already ends with "!" or "?".
    if not result.endswith((".", "!", "?")):
        result += "."
    return result
def ensure_complete_sentences(text: str) -> str:
    """Return ``text`` reduced to sentences that end in ``.``, ``!`` or ``?``.

    Whitespace runs are collapsed to single spaces. The text is then split
    after each terminal punctuation mark that is followed by whitespace, and
    fragments without terminal punctuation (typically a cut-off tail) are
    dropped.

    Args:
        text: Text to clean up. Non-string or empty input yields ``""``.

    Returns:
        The complete sentences joined by single spaces. If the text contains
        no complete sentence at all, the normalised text is returned with a
        period appended rather than being discarded.
    """
    if not text or not isinstance(text, str):
        return ""

    normalized = " ".join(text.split())
    if not normalized:
        return ""

    terminators = (".", "!", "?")
    fragments = re.split(r"(?<=[.!?])\s+", normalized)
    complete = [frag for frag in fragments if frag.endswith(terminators)]
    if complete:
        return " ".join(complete)

    # Nothing ends with terminal punctuation: keep the content and close it
    # with a period instead of returning an empty string.
    return normalized + "."