Spaces:
Sleeping
Sleeping
app.py
CHANGED
|
@@ -23,7 +23,9 @@ nest_asyncio.apply()
|
|
| 23 |
generator = pipeline("text-generation",
|
| 24 |
model="unsloth/gemma-3-1b-it",
|
| 25 |
device_map='cpu',
|
| 26 |
-
max_new_tokens=
|
|
|
|
|
|
|
| 27 |
# Async function to get voices
|
| 28 |
async def get_english_voices():
|
| 29 |
voices = await VoicesManager.create()
|
|
@@ -56,7 +58,7 @@ KEY_TERMS = [
|
|
| 56 |
def split_sentences(text):
|
| 57 |
return re.split(r'(?<=[.!?])\s+', text.strip())
|
| 58 |
|
| 59 |
-
def
|
| 60 |
reader = PdfReader(pdf_path)
|
| 61 |
full_text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
|
| 62 |
full_text = re.sub(r'\n+', '\n', full_text)
|
|
@@ -69,12 +71,7 @@ def extract_sections_from_pdf_old(pdf_path):
|
|
| 69 |
"third section continuing from Overview to methodology and no required to start introductuion between host & guest directly continue in discussion": r"\b(method(?:ology)?|proposed method|approach|model architecture|architecture|experimental setup|network design|implementation details|techniques|framework|learning algorithm|system description)\b",
|
| 70 |
"fourth and the last section continuing from methodology to conclusion and no required to start introductuion between host & guest directly continue in discussion and this is the end of conversation so conclude add thank remarks": r"\bconclusion(?:s)?\b|\bsummary\b|final thoughts\b|result(?:s)?",
|
| 71 |
}
|
| 72 |
-
|
| 73 |
-
"Start of podcast with first section of paper as abstract": r"^abstract\b",
|
| 74 |
-
"second section continuing from abstract to overview and no required to start introductuion between host & guest directly continue in discussion": r"^introduction\b|^overview\b",
|
| 75 |
-
"third section continuing from overview to methodology and no required to start introductuion between host & guest directly continue in discussion": r"^method(?:ology)?\b|^proposed method\b|^approach\b|^model architecture\b|^experimental setup\b|^network design\b",
|
| 76 |
-
"fourth and the last section continuing from methodology to conclusion and no required to start introductuion between host & guest directly continue in discussion and this is the end of conversation so conclude add thank remarks": r"^conclusion(?:s)?\b|^summary\b|^final thought(?:s)\b|^result(?:s)\b",
|
| 77 |
-
}
|
| 78 |
|
| 79 |
sections = {}
|
| 80 |
matches = []
|
|
@@ -96,66 +93,7 @@ def extract_sections_from_pdf_old(pdf_path):
|
|
| 96 |
|
| 97 |
return sections,section_patterns
|
| 98 |
|
| 99 |
-
# Define heading regex patterns
|
| 100 |
-
SECTION_LABELS = {
|
| 101 |
-
"abstract": r"\babstract\b",
|
| 102 |
-
"introduction": r"\bintroduction\b",
|
| 103 |
-
"methodology": r"\b(method(?:ology)?|approach|model architecture|implementation|framework|experimental setup)\b",
|
| 104 |
-
"conclusion": r"\bconclusion(?:s)?\b|\bsummary\b|final thoughts\b|result(?:s)?",
|
| 105 |
-
}
|
| 106 |
|
| 107 |
-
def is_heading(span):
|
| 108 |
-
"""Heuristic: if text is bold or font size is large, consider it heading"""
|
| 109 |
-
return span['size'] > 11 and span['font'].lower().find("bold") != -1
|
| 110 |
-
|
| 111 |
-
def clean_text(text):
|
| 112 |
-
return re.sub(r'\s+', ' ', text.strip())
|
| 113 |
-
|
| 114 |
-
def extract_sections_from_pdf(pdf_path):
|
| 115 |
-
doc = fitz.open(pdf_path)
|
| 116 |
-
|
| 117 |
-
headings = []
|
| 118 |
-
paragraphs = []
|
| 119 |
-
section_text_map = {}
|
| 120 |
-
|
| 121 |
-
# Extract headings and text blocks
|
| 122 |
-
for page in doc:
|
| 123 |
-
blocks = page.get_text("dict")["blocks"]
|
| 124 |
-
for block in blocks:
|
| 125 |
-
for line in block.get("lines", []):
|
| 126 |
-
for span in line["spans"]:
|
| 127 |
-
txt = clean_text(span["text"])
|
| 128 |
-
if len(txt) == 0:
|
| 129 |
-
continue
|
| 130 |
-
if is_heading(span):
|
| 131 |
-
headings.append((txt, page.number))
|
| 132 |
-
else:
|
| 133 |
-
paragraphs.append((txt, page.number))
|
| 134 |
-
|
| 135 |
-
# Identify section labels via regex
|
| 136 |
-
labeled_headings = []
|
| 137 |
-
for txt, page in headings:
|
| 138 |
-
for label, pattern in SECTION_LABELS.items():
|
| 139 |
-
if re.search(pattern, txt, re.IGNORECASE):
|
| 140 |
-
labeled_headings.append((label, txt, page))
|
| 141 |
-
|
| 142 |
-
# Sort labeled headings by page number
|
| 143 |
-
labeled_headings.sort(key=lambda x: x[2])
|
| 144 |
-
|
| 145 |
-
# Slice paragraphs by heading regions
|
| 146 |
-
for i, (label, _, start_page) in enumerate(labeled_headings):
|
| 147 |
-
end_page = labeled_headings[i + 1][2] if i + 1 < len(labeled_headings) else doc.page_count
|
| 148 |
-
|
| 149 |
-
# Filter relevant paragraphs
|
| 150 |
-
section_paras = [
|
| 151 |
-
p[0] for p in paragraphs if start_page <= p[1] < end_page
|
| 152 |
-
]
|
| 153 |
-
|
| 154 |
-
# Limit by 3–5 paragraphs for summarization efficiency
|
| 155 |
-
limited_text = "\n".join(section_paras[:5])
|
| 156 |
-
section_text_map[label] = limited_text
|
| 157 |
-
|
| 158 |
-
return section_text_map,SECTION_LABELS
|
| 159 |
|
| 160 |
def extract_paragraphs(text, max_paragraphs=4):
|
| 161 |
# Use double newlines if present
|
|
@@ -264,11 +202,8 @@ async def tts_edge_line_by_line(script):
|
|
| 264 |
print(f"⚠️ Skipping corrupt or empty file: {filename}")
|
| 265 |
continue
|
| 266 |
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
segments.append(segment)
|
| 270 |
-
except CouldntDecodeError as e:
|
| 271 |
-
print(f"❌ Error decoding {filename}: {e}")
|
| 272 |
|
| 273 |
return segments
|
| 274 |
|
|
|
|
| 23 |
generator = pipeline("text-generation",
|
| 24 |
model="unsloth/gemma-3-1b-it",
|
| 25 |
device_map='cpu',
|
| 26 |
+
max_new_tokens=350,
|
| 27 |
+
do_sample=True,
|
| 28 |
+
temperature=0.7,)
|
| 29 |
# Async function to get voices
|
| 30 |
async def get_english_voices():
|
| 31 |
voices = await VoicesManager.create()
|
|
|
|
| 58 |
def split_sentences(text):
|
| 59 |
return re.split(r'(?<=[.!?])\s+', text.strip())
|
| 60 |
|
| 61 |
+
def extract_sections_from_pdf(pdf_path):
|
| 62 |
reader = PdfReader(pdf_path)
|
| 63 |
full_text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
|
| 64 |
full_text = re.sub(r'\n+', '\n', full_text)
|
|
|
|
| 71 |
"third section continuing from Overview to methodology and no required to start introductuion between host & guest directly continue in discussion": r"\b(method(?:ology)?|proposed method|approach|model architecture|architecture|experimental setup|network design|implementation details|techniques|framework|learning algorithm|system description)\b",
|
| 72 |
"fourth and the last section continuing from methodology to conclusion and no required to start introductuion between host & guest directly continue in discussion and this is the end of conversation so conclude add thank remarks": r"\bconclusion(?:s)?\b|\bsummary\b|final thoughts\b|result(?:s)?",
|
| 73 |
}
|
| 74 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
sections = {}
|
| 77 |
matches = []
|
|
|
|
| 93 |
|
| 94 |
return sections,section_patterns
|
| 95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
def extract_paragraphs(text, max_paragraphs=4):
|
| 99 |
# Use double newlines if present
|
|
|
|
| 202 |
print(f"⚠️ Skipping corrupt or empty file: {filename}")
|
| 203 |
continue
|
| 204 |
|
| 205 |
+
segment = AudioSegment.from_mp3(filename)
|
| 206 |
+
segments.append(segment)
|
|
|
|
|
|
|
|
|
|
| 207 |
|
| 208 |
return segments
|
| 209 |
|