// ReadTagsTableData.cs
//
// This code is part of GrapeCity Documents for PDF samples.
// Copyright (c) GrapeCity, Inc. All rights reserved.
//
using System;
using System.IO;
using System.Drawing;
using System.Linq;
using System.Collections.Generic;
using GrapeCity.Documents.Pdf;
using GrapeCity.Documents.Text;
using GrapeCity.Documents.Pdf.TextMap;
using GrapeCity.Documents.Pdf.Structure;
using GrapeCity.Documents.Pdf.Recognition.Structure;

namespace GcPdfWeb.Samples
{
    // Find tables and read their data using structure tags.
    //
    // The sample loads a tagged PDF, looks for 'Table' structure elements directly
    // under the root structure element, extracts the text of their 'TD' cells, and
    // produces a new PDF in which every page of extracted table data is followed by
    // a rendering of the source page the tables were found on.
    public class ReadTagsTableData
    {
        // _tf is the base text format; _tfHdr (table headings) and _tfPgHdr (page headers)
        // are derived from it in CreatePDF().
        private TextFormat _tf, _tfHdr, _tfPgHdr;
        // Page margin used on all sides when laying out the extracted data.
        private readonly float _margin = 72;

        // Creates the sample PDF and saves it to the passed stream.
        //
        // stream: the stream that receives the resulting PDF.
        public void CreatePDF(Stream stream)
        {
            // Set up some text formats:
            _tf = new TextFormat()
            {
                Font = Font.FromFile(Path.Combine("Resources", "Fonts", "segoeui.ttf")),
                FontSize = 9,
                ForeColor = Color.Black
            };
            _tfHdr = new TextFormat(_tf)
            {
                Font = Font.FromFile(Path.Combine("Resources", "Fonts", "segoeuib.ttf")),
                FontSize = 11,
                ForeColor = Color.DarkBlue
            };
            _tfPgHdr = new TextFormat(_tf)
            {
                FontSize = 11,
                ForeColor = Color.Gray
            };

            // The resulting PDF:
            GcPdfDocument doc = new GcPdfDocument();
            // The source stream is kept open while the tables are printed, because
            // the source pages are drawn into the resulting document during that step:
            using (var s = File.OpenRead(Path.Combine("Resources", "PDFs", "C1Olap-QuickStart.pdf")))
            {
                var source = new GcPdfDocument();
                source.Load(s);
                PrintAllTables(doc, source);
            }
            // Save the PDF:
            doc.Save(stream);
        }

        // Finds the tables tagged in the source document and adds two pages to the
        // resulting document for each source page that contains tables: a page with
        // the extracted table data, followed by a rendering of the source page.
        // If the source has no structure tags, or no tables are found, a single page
        // with an explanatory note is added instead.
        //
        // doc: the document that receives the output pages.
        // source: the loaded tagged PDF to search for tables.
        private void PrintAllTables(GcPdfDocument doc, GcPdfDocument source)
        {
            // Get the LogicalStructure and top parent element:
            LogicalStructure ls = source.GetLogicalStructure();
            if (ls == null || ls.Elements == null || ls.Elements.Count == 0)
            {
                // No structure tags found:
                Common.Util.AddNote("No structure tags were found in the source document.", doc.Pages.Add());
                return;
            }
            // The root element:
            Element root = ls.Elements[0];

            // Find and print all tables. Only direct children of the root element
            // are inspected; tables nested deeper in the structure tree are not searched for:
            var tables = new List<(TextLayout Layout, Page SourcePage)>();
            if (root.HasChildren)
            {
                foreach (Element child in root.Children)
                {
                    if (child.StructElement.Type == "Table")
                    {
                        tables.Add(PrintTable(child));
                    }
                }
            }
            if (tables.Count == 0)
            {
                // Add a note so that the resulting document is not left without any pages:
                Common.Util.AddNote("No tables were found directly under the root structure element of the source document.", doc.Pages.Add());
                return;
            }

            // Group tables by the page they were found on.
            // For each page, print all tables found on that page,
            // followed by the original page for reference:
            foreach (var pageTables in tables.GroupBy(t => t.SourcePage.Index))
            {
                // All tables in the group share the same source page:
                Page srcPage = pageTables.First().SourcePage;
                // The page that will contain the extracted table data:
                var pgTables = doc.NewPage();
                // The page that will contain the source page for reference:
                var pgSrc = doc.NewPage();
                // Print the original page:
                srcPage.Draw(pgSrc.Graphics, pgSrc.Bounds);
                // Add a page header:
                pgSrc.Graphics.DrawString($"Page {srcPage.Index + 1} of the source PDF",
                    _tfPgHdr, new RectangleF(0, 0, pgSrc.Size.Width, _margin), TextAlignment.Center, ParagraphAlignment.Center, false);

                float maxWidth = pgTables.Size.Width - _margin * 2;
                float maxHeight = pgTables.Size.Height - _margin * 2;
                float y = _margin;
                // Print all table data. For simplicity sake we assume that all table data will fit on a single page:
                foreach (var t in pageTables)
                {
                    TextLayout layout = t.Layout;
                    layout.MaxHeight = maxHeight;
                    layout.MaxWidth = maxWidth;
                    pgTables.Graphics.DrawTextLayout(layout, new PointF(_margin, y));
                    // The next table starts one margin below the end of this one:
                    float used = layout.ContentHeight + _margin;
                    maxHeight -= used;
                    y += used;
                }
            }
        }

        // Reads the data of a table element into a text layout.
        //
        // e: the structure element of the table; its type must be 'Table'.
        // Returns the text layout with the table data (a heading followed by one line
        // per table row, with cells separated by tabs), and the source page on which
        // the table was found.
        // Throws ArgumentException if e is not a 'Table' element, and
        // InvalidOperationException if no page can be found for the table.
        private (TextLayout, Page) PrintTable(Element e)
        {
            if (e.Type != "Table")
            {
                throw new ArgumentException($"Unexpected: element type must be 'Table' but it is '{e.Type}'.", nameof(e));
            }

            // Rows of the table; each row is a list of cells, each cell is a list of paragraphs:
            var table = new List<List<IList<ITextParagraph>>>();
            int maxCols = 0;

            // Select all child elements with type TR - table rows. Elements of other
            // types are searched recursively, so rows wrapped in other elements are found too:
            void CollectRows(IList<Element> elements)
            {
                foreach (Element el in elements)
                {
                    if (!el.HasChildren)
                    {
                        continue;
                    }
                    if (el.StructElement.Type != "TR")
                    {
                        CollectRows(el.Children);
                        continue;
                    }
                    // Only TD children of a row are treated as cells:
                    var rowCells = new List<IList<ITextParagraph>>();
                    foreach (Element cell in el.Children)
                    {
                        if (cell.StructElement.Type == "TD")
                        {
                            rowCells.Add(cell.GetParagraphs());
                        }
                    }
                    maxCols = Math.Max(maxCols, rowCells.Count);
                    table.Add(rowCells);
                }
            }
            CollectRows(e.Children);

            // The page is needed both for the heading and for grouping by the caller:
            var sourcePage = FindPage(e.StructElement);
            if (sourcePage == null)
            {
                throw new InvalidOperationException("Unexpected: could not find the default page for the table.");
            }

            var tl = new TextLayout(72);

            // Add table data to the text layout:
            tl.Append($"\nTable on page {sourcePage.Index + 1} of the source document has {maxCols} column(s) and {table.Count} row(s).\nData by row:", _tfHdr);
            tl.AppendParagraphBreak();
            foreach (var row in table)
            {
                for (int icol = 0; icol < row.Count; ++icol)
                {
                    foreach (var para in row[icol])
                    {
                        tl.Append(para.GetText());
                    }
                    // Separate cells with tabs; no tab is needed after the last cell of a row:
                    if (icol < row.Count - 1)
                    {
                        tl.Append("\t");
                    }
                }
                // Each table row ends with a single line break:
                tl.AppendLine();
            }
            return (tl, sourcePage);
        }

        // Finds the page associated with a structure element: the element's own
        // default page if it has one, otherwise the first page found by a depth-first
        // search of its children.
        //
        // se: the structure element to find the page for.
        // Returns the page, or null if neither the element nor any of its
        // descendants has a default page.
        private Page FindPage(StructElement se)
        {
            if (se.DefaultPage != null)
            {
                return se.DefaultPage;
            }
            if (!se.HasChildren)
            {
                return null;
            }
            foreach (var child in se.Children)
            {
                var page = FindPage(child);
                if (page != null)
                {
                    return page;
                }
            }
            return null;
        }
    }
}