bahakizil commited on
Commit
33f677a
·
verified ·
1 Parent(s): 461b947

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +240 -179
app.py CHANGED
@@ -1,213 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
 
2
  import gradio as gr
3
- import tiktoken
4
- import docx
5
- import PyPDF2
6
 
7
- #######################################
8
- # 1) MODEL YÜKLEME
9
- #######################################
10
- # Hugging Face Spaces'de barındırılan bir modeli "gr.load" ile çağırabilirsiniz.
11
- # Örn: model_iface = gr.load("models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
 
 
 
12
 
13
- model_iface = gr.load("models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
 
 
14
 
15
- def call_model(prompt: str) -> str:
16
  """
17
- Model arayüzünü (model_iface) tek satırda çağırarak sonuç döndürür.
 
 
18
  """
19
- result = model_iface(prompt)
20
- if isinstance(result, str):
21
- return result
22
- return str(result)
23
-
24
- #######################################
25
- # 2) DOSYA OKUMA (PDF/DOCX/TXT)
26
- #######################################
27
- def read_file_to_text(file_obj) -> str:
 
 
28
  """
29
- file_obj: gradio'dan gelen dosya (pdf/docx/txt).
30
- Returns: metin (str)
31
  """
32
- if file_obj is None:
33
- return ""
34
-
35
- file_path = file_obj.name
36
- # Uzantı kontrolü
37
- _, ext = os.path.splitext(file_path)
38
- ext = ext.lower()
39
-
40
- if ext == ".pdf":
41
- return read_pdf(file_path)
42
- elif ext == ".docx":
43
- return read_docx(file_path)
44
- elif ext == ".txt":
45
- return read_txt(file_path)
46
- else:
47
- # Bilinmeyen format - basitçe hata ya da boş dönebilir
48
- return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  def read_pdf(file_path: str) -> str:
 
51
  text = ""
52
  with open(file_path, "rb") as f:
53
- reader = PyPDF2.PdfReader(f)
54
  for page in reader.pages:
55
- text += page.extract_text() + "\n"
 
 
56
  return text
57
 
58
  def read_docx(file_path: str) -> str:
59
- doc = docx.Document(file_path)
60
- full_text = []
 
61
  for para in doc.paragraphs:
62
- full_text.append(para.text)
63
- return "\n".join(full_text)
64
 
65
  def read_txt(file_path: str) -> str:
 
66
  with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
67
  return f.read()
68
 
69
- #######################################
70
- # 3) TIKTOKEN CHUNK
71
- #######################################
72
- def chunk_text_with_tiktoken(text: str, chunk_size=500, model_name="gpt-3.5-turbo"):
73
  """
74
- text'i 'chunk_size' token uzunluklarında parçalara böler (token bazlı).
 
 
75
  """
76
- encoding = tiktoken.encoding_for_model(model_name)
77
- tokens = encoding.encode(text)
78
- chunks = []
79
- for i in range(0, len(tokens), chunk_size):
80
- sub_tokens = tokens[i:i+chunk_size]
81
- chunk_str = encoding.decode(sub_tokens)
82
- chunks.append(chunk_str)
83
- return chunks
84
-
85
- #######################################
86
- # 4) 11 CHUNK: 4 HEADING + 3 VALIDATION
87
- #######################################
88
- def generate_4_headings_3_validation(full_text: str) -> str:
 
 
 
 
 
 
 
 
 
 
 
89
  """
90
- 4 heading (her heading 2 chunk: üretici + kontrol = 8) + 3 validation = 11 chunk
 
 
 
 
91
  """
 
 
92
 
93
- final_output = ""
 
94
 
95
- # ========== HEADING 1 ==========
96
- # 1) üretici
97
- h1_prod = call_model(
98
- f"[HEADING 1 PRODUCTION]\n"
99
- f"Input:\n{full_text}\n"
100
- "Task: 'Heading 1: Introductory overview' with 3000-6000 chars."
101
- )
102
- # 2) kontrol
103
- h1_ctrl = call_model(
104
- f"[HEADING 1 CONTROL]\n"
105
- f"H1 Production:\n{h1_prod}\n"
106
- "Check 3000-6000 chars, fix if needed."
107
- )
108
- final_output += f"<b>HEADING 1 (Corrected)</b><hr>\n{h1_ctrl}\n\n"
109
-
110
- # ========== HEADING 2 ==========
111
- # 3) üretici
112
- h2_prod = call_model(
113
- f"[HEADING 2 PRODUCTION]\n"
114
- f"Input:\n{full_text}\n"
115
- "Task: 'Heading 2: Detailed explanation of common risks' with 500-1200 chars."
116
- )
117
- # 4) kontrol
118
- h2_ctrl = call_model(
119
- f"[HEADING 2 CONTROL]\n"
120
- f"H2 Production:\n{h2_prod}\n"
121
- "Check 500-1200 chars, fix if needed."
122
- )
123
- final_output += f"<b>HEADING 2 (Corrected)</b><hr>\n{h2_ctrl}\n\n"
124
-
125
- # ========== HEADING 3 ==========
126
- # 5) üretici
127
- h3_prod = call_model(
128
- f"[HEADING 3 PRODUCTION]\n"
129
- f"Input:\n{full_text}\n"
130
- "Task: 'Heading 3: Practical examples and solutions' with 500-1200 chars."
131
- )
132
- # 6) kontrol
133
- h3_ctrl = call_model(
134
- f"[HEADING 3 CONTROL]\n"
135
- f"H3 Production:\n{h3_prod}\n"
136
- "Check 500-1200 chars, fix if needed."
137
- )
138
- final_output += f"<b>HEADING 3 (Corrected)</b><hr>\n{h3_ctrl}\n\n"
139
-
140
- # ========== HEADING 4 ==========
141
- # 7) üretici
142
- h4_prod = call_model(
143
- f"[HEADING 4 PRODUCTION]\n"
144
- f"Input:\n{full_text}\n"
145
- "Task: 'Heading 4: Summary and next steps for students' with 500-1200 chars."
146
- )
147
- # 8) kontrol
148
- h4_ctrl = call_model(
149
- f"[HEADING 4 CONTROL]\n"
150
- f"H4 Production:\n{h4_prod}\n"
151
- "Check 500-1200 chars, fix if needed."
152
- )
153
- final_output += f"<b>HEADING 4 (Corrected)</b><hr>\n{h4_ctrl}\n\n"
154
-
155
- # ========== 3 VALIDATION CHUNK ==========
156
- current_text = final_output
157
- for i in range(1, 4):
158
- validation_out = call_model(
159
- f"[VALIDATION #{i}]\n"
160
- f"Current text:\n{current_text}\n"
161
- "Check headings' constraints. If fixes needed, do them. Otherwise 'No changes needed.'"
162
- )
163
- current_text = validation_out
164
 
165
- return current_text
 
166
 
167
- #######################################
168
- # 5) GRADIO ARAYÜZ FONKSİYONU
169
- #######################################
170
- def main_interface(file, manual_text, chunk_size):
171
- """
172
- file: Yüklenen dosya (PDF/DOCX/TXT)
173
- manual_text: Kullanıcının girdiği ham metin
174
- chunk_size: Tiktoken chunk uzunluğu
175
- """
 
 
 
 
 
 
 
 
 
 
 
 
176
 
177
- # 1) Dosya varsa, ondan metin çekelim
178
- doc_text = read_file_to_text(file)
179
-
180
- # 2) Metni oluştur -> file metni + manual_text
181
- combined_text = (doc_text + "\n" + manual_text).strip()
182
- if not combined_text:
183
- return "No input text found."
184
-
185
- # 3) Tiktoken chunk
186
- chunks = chunk_text_with_tiktoken(combined_text, chunk_size=chunk_size)
187
-
188
- # 4) Tüm chunk'ları birleştirip (veya isterseniz parça parça da işleyebilirsiniz),
189
- # 11-chunk mantığına sokalım
190
- full_text = "\n".join(chunks)
191
- final_output = generate_4_headings_3_validation(full_text)
192
-
193
- return final_output.replace("\n", "<br>")
194
-
195
- #######################################
196
- # 6) GRADIO ARAYÜZ TANIMI
197
- #######################################
198
- demo = gr.Interface(
199
- fn=main_interface,
200
- inputs=[
201
- gr.File(label="Upload PDF/DOCX/TXT (optional)"),
202
- gr.Textbox(lines=5, label="Or Paste Some Text"),
203
- gr.Slider(minimum=100, maximum=2000, step=100, value=500, label="Chunk Size (tokens)")
204
- ],
205
- outputs="html",
206
- title="PDF/DOCX + Tiktoken + 4 Heading + 3 Validation (11 Chunk)"
207
- )
208
-
209
- def run():
210
- demo.launch()
211
 
 
212
  if __name__ == "__main__":
213
- run()
 
 
 
 
1
+ # app.py
2
+ # --------------------------------------------------------------------------------
3
+ # Bu kod, tamamen geliştirici (insan) tarafından, öğretici ve eğitim amacıyla
4
+ # yazılmıştır. GPT-4o-mini modelini kullanarak 4 başlık + 1 kontrol chunk (5 chunk)
5
+ # şeklinde metin oluşturma akışını gösterir. Minimum 4000, maksimum 10000 kelime
6
+ # üretilmesi hedeflenir. Kod, Gradio ile görsel bir arayüz sunar.
7
+ #
8
+ # NOT: Lütfen 'YOUR_API_KEY_HERE' kısmına kendi OpenAI API anahtarınızı ekleyin.
9
+ # Bu kodda max_tokens 10,000, temperature 0.8 kullanarak uzun ve yaratıcı çıktılar
10
+ # elde etmeyi amaçlıyoruz.
11
+ #
12
+ # Bu proje tamamen insan emeğiyle yazılmıştır, geliştirici tarafından tasarlanmıştır.
13
+ # --------------------------------------------------------------------------------
14
+
15
  import os
16
+ import re
17
  import gradio as gr
 
 
 
18
 
19
+ # Ek kütüphaneler
20
+ try:
21
+ from openai import OpenAI
22
+ import tiktoken
23
+ from PyPDF2 import PdfReader
24
+ from docx import Document
25
+ except ImportError:
26
+ raise ImportError("Lütfen 'openai', 'tiktoken', 'gradio', 'PyPDF2', 'python-docx' paketlerini kurun.")
27
 
28
# -------------------------- OpenAI settings --------------------------
# API client for the GPT-4o-mini model.
# Fix: never hard-code a real key in source. Read it from the environment
# (OPENAI_API_KEY); fall back to the old placeholder so behavior is
# unchanged when the variable is unset.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY", "YOUR_API_KEY_HERE"))
31
 
32
def call_openai_chat(messages, max_tokens=10000, temperature=0.8, model="gpt-4o-mini"):
    """Send a chat-completion request and return the reply text.

    Parameters:
        messages: list of {"role": ..., "content": ...} chat messages.
        max_tokens: generation cap (10000 -> long outputs).
        temperature: 0.8 -> more creative / longer prose.
        model: OpenAI model id. The previously hard-coded "gpt-4o-mini" is
               now a backward-compatible keyword parameter so callers can
               override it without changing this function.

    Returns:
        The assistant message content (str) of the first choice.
    """
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        stop=None  # no early stop sequences
    )
    return response.choices[0].message.content
46
+
47
+ # ------------------------- Chunk Mantığı -------------------------
48
def heading1_part1(input_text):
    """Chunk #1 -- first half of Heading 1.

    Produces a partial "Introductory overview" from the user-provided text.
    """
    prompt = f"""
We have some input text. We want the first part of 'Heading 1: Introductory overview of input'.
Please produce a partial text focusing on an introduction (about 1000+ words).
Do NOT finalize heading 1 yet, just a partial introduction.

Input text:
{input_text}
"""
    # Fixed system role plus the prompt above; delegated to the shared caller.
    return call_openai_chat([
        {"role": "system", "content": "You are a helpful assistant generating partial text for heading #1."},
        {"role": "user", "content": prompt},
    ])
66
+
67
def heading1_part2(h1_part1_text):
    """Chunk #2 -- second half of Heading 1.

    Expands the partial draft into the final heading (target: 2000+ words).
    """
    prompt = f"""
Below is the partial text for heading 1:
{h1_part1_text}

Now finalize heading 1 by merging expansions or clarifications.
Ensure heading 1 is at least 2000 words in total. Add depth and examples.
Return only the final text for heading 1.
"""
    # Same two-message shape as chunk #1, with a finalizing system role.
    return call_openai_chat([
        {"role": "system", "content": "You are finalizing heading #1."},
        {"role": "user", "content": prompt},
    ])
85
+
86
def single_heading_chunk(existing_text, heading_title):
    """Chunk #3 or #4 -- generate one heading in a single request.

    existing_text: prior content (e.g. the finalized heading 1) used as context.
    heading_title: title of the heading to produce (~1000+ words targeted).
    """
    prompt = f"""
We have some text for context (heading1 or previous content).
Please produce a new heading: '{heading_title}' with around 1000+ words if possible.
Do not produce final expansions for other headings.

Context:
{existing_text}
"""
    conversation = [
        {"role": "system", "content": "You are generating a single-chunk heading text."},
        {"role": "user", "content": prompt},
    ]
    return call_openai_chat(conversation)
104
+
105
def heading4_and_expansions(heading1_text, heading2_text, heading3_text, input_text):
    """Chunk #5 -- produce Heading 4 and merge everything into the final text.

    Asks the model to write 'Heading 4: Summary and next steps for students',
    combine headings 1-4, then expand the result to 4000+ words or shorten it
    below 10000 words.

    Bug fix: the original prompt never interpolated `heading3_text` -- it
    contained a placeholder line "(Will be produced next, or we have it if
    created)" -- so the model finalized the document without ever seeing
    Heading 3. The parameter is now inserted into the prompt.
    """
    user_prompt = f"""
We have 3 headings so far:

[Heading 1]
{heading1_text}

[Heading 2]
{heading2_text}

[Heading 3]
{heading3_text}

Now produce Heading 4: 'Summary and next steps for students.'
Then combine headings 1,2,3,4 into one final text.
If the entire text (4 headings) is under 4000 words, expand or add content
to any heading until we reach 4000+ words.
If above 10000 words, shorten while keeping crucial details.
Return the final text with headings 1,2,3,4 merged.
No separate block, but unify expansions or edits.

You can also use original input context:
{input_text}
"""
    messages = [
        {"role": "system", "content": "You are finalizing heading #4 and ensuring total word count 4000-10000."},
        {"role": "user", "content": user_prompt},
    ]
    return call_openai_chat(messages)
138
+
139
+ # -------------------- Dosya Okuma Yardımcı Fonksiyonlar --------------------
140
def read_pdf(file_path: str) -> str:
    """Extract text from a PDF file.

    Fix: the original concatenated page texts with no separator, which glued
    the last word of one page to the first word of the next. Pages are now
    joined with a newline.
    """
    with open(file_path, "rb") as f:
        reader = PdfReader(f)
        # extract_text() may return None for image-only pages; skip those.
        pages = [page.extract_text() for page in reader.pages]
    return "\n".join(text for text in pages if text)
150
 
151
def read_docx(file_path: str) -> str:
    """Reads text from a DOCX file (one line per paragraph)."""
    document = Document(file_path)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)
158
 
159
def read_txt(file_path: str) -> str:
    """Reads a plain-text file as UTF-8, silently dropping undecodable bytes."""
    with open(file_path, mode="r", encoding="utf-8", errors="ignore") as handle:
        return handle.read()
163
 
164
+ # --------------- Gradio Arayüz Fonksiyonları ---------------
165
def process_input_text_or_file(txt_input, file_obj):
    """Return the input text, preferring an uploaded file over the textbox.

    txt_input: free text from the textbox (str or None).
    file_obj: upload from Gradio (an object with a `.name` temp path, or a
              plain path string in newer Gradio versions), or None.

    Fixes over the original:
    - Gradio already stores the upload as a temp file on disk; the original
      called `file_obj.read()` (not available on newer Gradio file values)
      and then re-wrote the bytes to the same path. We now use the path
      directly.
    - `txt_input.strip()` crashed when the textbox value was None; guarded.
    """
    if file_obj is None:
        # No file uploaded -> fall back to the textbox content.
        return (txt_input or "").strip()

    # Newer Gradio may hand us a plain filepath string instead of an object.
    file_name = getattr(file_obj, "name", None) or str(file_obj)
    ext = os.path.splitext(file_name)[1].lower().lstrip(".")

    if ext == "pdf":
        return read_pdf(file_name)
    elif ext == "docx":
        return read_docx(file_name)
    elif ext == "txt":
        return read_txt(file_name)
    else:
        # Unknown extension: best-effort decode of the raw bytes.
        with open(file_name, "rb") as fh:
            return fh.read().decode("utf-8", errors="ignore")
193
+
194
def generate_5_chunks(input_txt):
    """Run the five-chunk generation pipeline.

    Chunks: #1 Heading1 part1, #2 Heading1 part2 (finalize), #3 Heading2,
    #4 Heading3, #5 Heading4 + expand/shorten -> final merged text.

    Returns:
        (html, info): the final text with <br> line breaks, and a short
        status string reporting the approximate word count.
    """
    # Chunks #1 + #2: draft heading 1, then finalize it.
    h1 = heading1_part2(heading1_part1(input_txt))

    # Chunks #3 and #4: headings 2 and 3, each using heading 1 as context.
    h2 = single_heading_chunk(h1, "Heading 2: Detailed explanation of common risks.")
    h3 = single_heading_chunk(h1, "Heading 3: Practical examples and solutions.")

    # Chunk #5: heading 4 plus merge/expand/shorten into the final text.
    final_text = heading4_and_expansions(h1, h2, h3, input_txt)

    # Render newlines as <br> for HTML output.
    final_html = final_text.replace("\n", "<br>")

    # Word count on a tag-stripped copy of the text.
    stripped = re.sub(r"<.*?>", "", final_text)
    wcount = len(stripped.split())

    info = f"✅ Done. The final text is approx {wcount} words."
    return final_html, info
226
+
227
+ def gradio_interface(txt_input, file_upload):
228
+ # Tek fonksiyon, hem input hem output
229
+ read_content = process_input_text_or_file(txt_input, file_upload)
230
+ if not read_content:
231
+ return "⚠️ Please provide text or file input.", ""
232
+ # 5-chunk workflow
233
+ final_html, info = generate_5_chunks(read_content)
234
+ return final_html, info
235
 
236
+ # --------------- Gradio Demo ---------------
237
def build_gradio_app():
    """Construct the Gradio Interface (text + file inputs, HTML + info outputs).

    Bug fix: `gr.File(..., optional=True)` -- the `optional` keyword was
    removed in modern Gradio releases, so passing it raises a TypeError at
    startup. File inputs are optional by default in gr.Interface, so the
    flag is simply dropped. (NOTE(review): harmless but also unnecessary on
    old Gradio versions that still accepted it.)
    """
    # Text and file inputs; either one may be used.
    text_input = gr.Textbox(
        lines=5,
        label="Text Input (Optional)",
        placeholder="Enter some text or upload a file..."
    )
    file_input = gr.File(
        label="Upload File (PDF/DOCX/TXT)",
        file_types=[".pdf", ".docx", ".txt"]
    )
    # Outputs: the generated HTML plus a status/word-count label.
    output_html = gr.HTML(label="Generated Output (Min 4000 words, Max 10000 words)")
    info_label = gr.Label(label="Process Info (Word Count etc.)")

    demo = gr.Interface(
        fn=gradio_interface,
        inputs=[text_input, file_input],
        outputs=[output_html, info_label],
        title="5-Chunks GPT-4o-mini (4000-10000 words) Example",
        description=(
            "A demonstration of chunk-based approach with GPT-4o-mini model. "
            "We produce 4 headings: "
            "Heading1(part1+part2), Heading2, Heading3, and then Heading4 & expansions "
            "if total words < 4000 or shorten if > 10000."
            "\n(Coded by a human developer, not AI. For educational purposes.)"
        )
    )
    return demo
 
 
268
 
269
# Script entry point.
if __name__ == "__main__":
    # Build the Gradio app and serve it (Gradio's default local host is
    # http://127.0.0.1:7860).
    demo_app = build_gradio_app()
    demo_app.launch()