from PyPDF2 import PdfReader
def custom_rstrip(string: str) -> str:
    """Return *string* with trailing half-width and full-width spaces removed.

    Only U+0020 (ASCII space) and U+3000 (ideographic space) are stripped;
    other trailing whitespace such as tabs or newlines is left untouched.
    """
    return string.rstrip(' \u3000')
# Longest title (in characters) still accepted as a section heading; longer
# candidates are treated as ordinary body text.
_MAX_TITLE_LENGTH = 15


def _is_fullwidth_digit(char):
    """Return True if *char* is one of the full-width digits U+FF10..U+FF19."""
    return '\uff10' <= char <= '\uff19'


def _heading_title(line):
    """Return the title of *line* if it is a section heading, else None.

    A heading is a full-width digit, then one half-width or full-width space,
    then a title that contains neither "、" nor "。" and is at most
    ``_MAX_TITLE_LENGTH`` characters long once trailing spaces are removed.
    """
    # The length check keeps line[1] from raising IndexError on short lines.
    if len(line) < 2:
        return None
    if not _is_fullwidth_digit(line[0]) or line[1] not in ('\u3000', ' '):
        return None
    title = custom_rstrip(line[2:])
    # Punctuation or excessive length means this is a numbered sentence in
    # the body, not a heading.
    if '、' in title or '。' in title or len(title) > _MAX_TITLE_LENGTH:
        return None
    return title


def _collect_sections(page_texts):
    """Split per-page text into sections delimited by numbered headings.

    Args:
        page_texts: Iterable of page text in page order. Falsy items (pages
            without extractable text) are skipped but still count towards
            the page numbering.

    Returns:
        A dict mapping a 1-based running index to a tuple
        ``(start_page, label, title, body)``: the 1-based page on which the
        heading appeared, the heading's leading digit, the heading title and
        the section's body lines concatenated without separators.
        Text that precedes the first heading is discarded.
    """
    sections = {}
    label = ''
    title = ''
    start_page = 0
    body_lines = []

    def save_open_section():
        # Nothing is open until the first heading has been seen.
        if label:
            sections[len(sections) + 1] = (
                start_page, label, title, ''.join(body_lines))

    for page_number, page_text in enumerate(page_texts, start=1):
        if not page_text:
            continue
        for line in page_text.split('\n'):
            # A line holding only a full-width digit has no section name;
            # skip it instead of failing on the missing second character.
            if len(line) == 1 and _is_fullwidth_digit(line):
                continue
            new_title = _heading_title(line)
            if new_title is None:
                body_lines.append(line)
                continue
            # Store the finished section under its OWN title before the
            # state is switched over to the new heading.
            save_open_section()
            label = line[0]
            title = new_title
            start_page = page_number
            body_lines = []

    # The final section has no following heading to trigger its save.
    save_open_section()
    return sections


# Read the PDF and split its text into sections.
file_path = '01_tebiki2023bun.pdf'
reader = PdfReader(file_path)

# Sections keyed by running index, each with its start page, label, title
# and body text.
sections_with_pages = _collect_sections(
    page.extract_text() for page in reader.pages
)

# Show every collected section (rendered as the cell output in a notebook).
list(sections_with_pages.items())