'' WordIndex.vb
''
'' This code is part of Document Solutions for PDF demos.
'' Copyright (c) MESCIUS inc. All rights reserved.
''
Imports System.IO
Imports System.Drawing
Imports GrapeCity.Documents.Pdf
Imports GrapeCity.Documents.Pdf.TextMap
Imports GrapeCity.Documents.Text
Imports GrapeCity.Documents.Common
Imports GrapeCity.Documents.Pdf.Annotations

'' This sample loads an existing PDF and, using a predefined list of key words,
'' builds an alphabetical index of those words linked to pages where they occur
'' in the document. The generated index pages are appended to the original document,
'' and saved in a new PDF.
'' The index is rendered in two balanced columns, using the technique
'' demonstrated in the BalancedColumns sample.
''
'' NOTE: if you download this sample and run it locally on your own system 
'' without a valid DsPdf license, only the first five pages of the sample PDF
'' will be loaded, and the index will be generated for those five pages only.
Public Class WordIndex

    '' Fonts registered here are assigned to the text layouts created by this sample:
    Private _fc As New FontCollection()
    '' Name of the font family used for all text (this is not case-sensitive):
    Const _fontFamily As String = "segoe ui"

    '' Main sample entry.
    '' Loads the source PDF, appends a generated word index to it, and saves the
    '' result into the passed stream.
    '' Returns the number of pages in the saved document (original plus index pages).
    Function CreatePDF(ByVal stream As Stream) As Integer
        '' Make the fonts found in the resources directory available to text layouts:
        _fc.RegisterDirectory(Path.Combine("Resources", "Fonts"))

        '' The PDF that the index is built for:
        Dim sourcePath = Path.Combine("Resources", "PDFs", "CompleteJavaScriptBook.pdf")

        '' Key words to index: de-duplicated ignoring case, then empty entries dropped.
        '' (This is a deferred query; it is evaluated when AddWordIndex enumerates it.)
        Dim uniqueWords = _keywords.Distinct(StringComparer.InvariantCultureIgnoreCase)
        Dim indexWords = uniqueWords.Where(Function(w_) Not String.IsNullOrEmpty(w_))

        Using input As New FileStream(sourcePath, FileMode.Open, FileAccess.Read)
            Dim pdf As New GcPdfDocument()
            pdf.Load(input)
            '' The count of original pages is also the zero-based index
            '' of the first index page that will be appended:
            Dim firstIndexPage = pdf.Pages.Count
            AddWordIndex(pdf, indexWords)
            '' Open document on the first index page by default
            '' (may not work in browser viewers, but works in Acrobat):
            pdf.OpenAction = New DestinationFit(firstIndexPage)
            pdf.Save(stream)
            Return pdf.Pages.Count
        End Using
    End Function

    '' The list of words to build the index on.
    '' Entries may be single words or phrases containing spaces.
    '' Each entry should be listed once (a repeated "API" entry was removed).
    Private ReadOnly _keywords() As String =
        {
            "JavaScript", "Framework", "MVC", "npm", "URL", "CDN", "HTML5", "CSS", "ES2015", "web",
            "Node.js", "API", "model", "view", "controller", "data management", "UI", "HTML",
            "function", "var", "component", "design pattern", "React.js", "Angular", "AJAX",
            "DOM", "TypeScript", "ECMAScript", "CLI", "Wijmo", "CoffeeScript", "Elm",
            "plugin", "VueJS", "Knockout", "event", "AngularJS", "pure JS", "data binding", "OOP", "GrapeCity",
            "gauge", "JSX", "mobile", "desktop", "Vue", "template", "server-side", "client-side",
            "SPEC", "RAM", "ECMA"
        }

    '' Searches the passed text maps for the text described by 'tp' and returns
    '' the sorted set of indices of the pages on which it was found.
    '' Calling FindText() on a document or a page builds text maps for each page on the fly.
    '' Reusing cached text maps speeds things up a lot.
    Private Function FindTextPages(ByVal maps As ITextMap(), ByVal tp As FindTextParams) As SortedSet(Of Integer)
        Dim pagesWithHits As New SortedSet(Of Integer)()
        '' Read by the callback below; set before each map is searched:
        Dim pageIdx = -1
        For m = 0 To maps.Length - 1
            Dim textMap = maps(m)
            pageIdx = textMap.Page.Index
            '' The set ignores repeated adds, so several hits on one page yield a single entry:
            textMap.FindText(tp, Function(fp_) pagesWithHits.Add(pageIdx))
        Next
        Return pagesWithHits
    End Function

    '' Adds a word index to the end of the passed document.
    ''   doc   - the document to search; the generated index pages are appended to it.
    ''   words - the key words to look up. Null or empty entries are skipped,
    ''           and a repeated entry simply replaces the earlier identical result.
    '' For each key word that occurs in the document, the index lists the (1-based)
    '' numbers of the pages where it was found, each page number being a link to that page.
    '' At least one page is always appended, even if no key words were found.
    Private Sub AddWordIndex(ByVal doc As GcPdfDocument, ByVal words As IEnumerable(Of String))
        Dim tStart = Util.TimeNow()

        '' Build text maps for all pages to speed up FindText() calls
        Dim textMaps(doc.Pages.Count - 1) As ITextMap
        For i = 0 To doc.Pages.Count - 1
            textMaps(i) = doc.Pages(i).GetTextMap()
        Next

        '' Words and page indices where they occur, sorted on words:
        Dim index = New SortedDictionary(Of String, List(Of Integer))()

        '' Here the main loop building the index is on key words.
        '' An alternative would be to loop over the pages.
        '' Depending on the relative sizes of the keyword dictionary vs
        '' the number of pages in the document, one or the other might be better,
        '' but this is beyond the scope of this sample.
        For Each word In words
            '' The rendering loop below reads word(0), so empty entries must not get into the index:
            If String.IsNullOrEmpty(word) Then
                Continue For
            End If
            '' Phrases (entries containing a space) are not searched as whole words:
            Dim wholeWord As Boolean = word.IndexOf(" "c) = -1
            Dim pgs = FindTextPages(textMaps, New FindTextParams(word, wholeWord, False))
            '' A very simplistic way of also finding plurals
            '' (ordinal comparison keeps the check independent of the current culture):
            If wholeWord AndAlso Not word.EndsWith("s", StringComparison.Ordinal) Then
                pgs.UnionWith(FindTextPages(textMaps, New FindTextParams(word + "s", wholeWord, False)))
            End If
            If (pgs.Any()) Then
                '' Indexer assignment (rather than Add) does not throw if a word is passed twice:
                index(word) = pgs.ToList()
            End If
        Next

        '' Prepare to render the index. The whole index is built
        '' in a single TextLayout instance, set up to render it
        '' in two columns per page.
        '' The main rendering loop uses the TextLayout.SplitAndBalance method
        '' using the approach demonstrated in BalancedColumns sample.
        '' The complication here is that we need to associate a link to the
        '' relevant page with each page number rendered, see linkIndices below.
        '' Set up the TextLayout:
        Const margin = 72.0F
        Dim pageWidth = doc.PageSize.Width
        Dim pageHeight = doc.PageSize.Height
        Dim cW = pageWidth - margin * 2
        '' Caption (index letter) format:
        Dim tfCap = New TextFormat() With {
            .FontName = _fontFamily,
            .FontBold = True,
            .FontSize = 16,
            .LineGap = 24
        }
        '' Index word and pages format:
        Dim tfRun = New TextFormat() With {
            .FontName = _fontFamily,
            .FontSize = 10
        }
        '' Page headers/footers:
        Dim tfHdr = New TextFormat() With {
            .FontName = _fontFamily,
            .FontItalic = True,
            .FontSize = 10
        }
        '' FirstLineIndent = -18 sets up hanging indent:
        Dim tl = New TextLayout(72) With {
            .FontCollection = _fc,
            .FirstLineIndent = -18,
            .MaxWidth = pageWidth,
            .MaxHeight = pageHeight,
            .MarginLeft = margin,
            .MarginRight = margin,
            .MarginBottom = margin,
            .MarginTop = margin,
            .ColumnWidth = cW * 0.46F,
            .TextAlignment = TextAlignment.Leading,
            .ParagraphSpacing = 4,
            .LineGapBeforeFirstLine = False
        }

        '' The list of text runs created for page numbers,
        '' each paired with the (0-based) index of the page it refers to:
        Dim pgnumRuns = New List(Of Tuple(Of TextRun, Integer))()
        '' This loop builds the index on the TextLayout, saving the text runs
        '' created for each page number rendered. Note that at this point
        '' (prior to the PerformLayout(true) call) the text runs do not contain any info
        '' about their code points and render locations, so we can only save the text runs here.
        '' Later they will be used to add links to referenced pages in the PDF:
        Dim litera As Char = " "c
        For Each kvp In index
            Dim word = kvp.Key
            Dim pageIndices = kvp.Value
            '' Start a new captioned section whenever the first letter changes:
            If Char.ToUpper(word(0)) <> litera Then
                litera = Char.ToUpper(word(0))
                tl.Append($"{litera}{ChrW(&H2029)}", tfCap)
            End If
            tl.Append(word, tfRun)
            tl.Append("  ", tfRun)
            '' NOTE: the loop counter is deliberately advanced inside the body ("fast forward" below).
            For i = 0 To pageIndices.Count - 1
                Dim from_ = pageIndices(i)
                Dim tr = tl.Append((from_ + 1).ToString(), tfRun)
                pgnumRuns.Add(Tuple.Create(Of TextRun, Integer)(tr, from_))
                '' We merge sequential pages into "..-M".
                '' k is the index of the last page in the run of consecutive pages starting at i:
                Dim k = i
                For j = i + 1 To pageIndices.Count - 1
                    If pageIndices(j) <> pageIndices(j - 1) + 1 Then
                        Exit For
                    End If
                    k = j
                Next
                '' Only runs of three or more pages are merged; two adjacent pages are listed as "N, M":
                If (k > i + 1) Then
                    tl.Append("-", tfRun)
                    Dim to_ = pageIndices(k)
                    tr = tl.Append((to_ + 1).ToString(), tfRun)
                    pgnumRuns.Add(Tuple.Create(Of TextRun, Integer)(tr, to_))
                    '' Fast forward:
                    i = k
                End If
                If (i < pageIndices.Count - 1) Then
                    tl.Append(", ", tfRun)
                Else
                    tl.AppendLine(tfRun)
                End If
            Next
        Next
        '' This calculates the glyphs and lays out the whole index.
        '' The tl.SplitAndBalance() call in the loop below does not require redoing the layout:
        tl.PerformLayout(True)

        ''
        '' Now we are ready to split and render the text layout, and also add links to page numbers.
        ''

        '' Split areas and options - see BalancedColumns for details:
        Dim psas() As PageSplitArea = {
            New PageSplitArea(tl) With {.MarginLeft = tl.MarginLeft + (cW * 0.54F)}
        }
        Dim tso = New TextSplitOptions(tl) With {
            .KeepParagraphLinesTogether = True
        }

        '' First original code point index in the current column:
        Dim cpiStart = 0
        '' Max+1 original code point index in the current column:
        Dim cpiEnd = 0
        '' Current index in pgnumRuns:
        Dim pgnumRunsIdx = 0

        '' Method to add links to actual pages over page numbers in the current column.
        '' It must be called once per rendered column, in rendering order, because it
        '' advances cpiStart/cpiEnd/pgnumRunsIdx as it goes:
        Dim linkIndices As Action(Of TextLayout, Page) =
            Sub(tl_, page_)
                cpiEnd += tl_.CodePointCount
                While pgnumRunsIdx < pgnumRuns.Count
                    Dim run = pgnumRuns(pgnumRunsIdx)
                    Dim textRun = run.Item1
                    Dim cpi = textRun.CodePointIndex
                    '' The run belongs to a later column:
                    If cpi >= cpiEnd Then
                        Exit While
                    End If
                    '' Convert to a code point index relative to this column's layout:
                    cpi -= cpiStart
                    Dim rects = tl_.GetTextRects(cpi, textRun.CodePointCount)
                    Debug.Assert(rects.Count > 0)
                    '' Debug.Assert is compiled out of release builds, so guard the indexing explicitly:
                    If rects.Count > 0 Then
                        page_.Annotations.Add(New LinkAnnotation(rects(0).ToRectangleF(), New DestinationFit(run.Item2)))
                    End If
                    pgnumRunsIdx += 1
                End While
                cpiStart += tl_.CodePointCount
            End Sub

        '' Split and render the index in 2 columns:
        Dim page = doc.Pages.Add()
        While True
            Dim g = page.Graphics
            '' Add a simple page header:
            g.DrawString($"Index generated by DsPdf on {tStart:R}", tfHdr,
                New RectangleF(margin, 0, pageWidth - margin * 2, margin),
                TextAlignment.Center, ParagraphAlignment.Center, False)
            '' 'rest' will accept the text that did not fit on this page:
            Dim rest As TextLayout = Nothing
            Dim splitResult = tl.SplitAndBalance(psas, tso, rest)
            '' Render text (left column, then right column):
            g.DrawTextLayout(tl, PointF.Empty)
            g.DrawTextLayout(psas(0).TextLayout, PointF.Empty)
            '' Add links from page numbers to pages (same column order as rendering):
            linkIndices(tl, page)
            linkIndices(psas(0).TextLayout, page)
            '' Are we done yet?
            If splitResult <> SplitResult.Split Then
                Exit While
            End If
            '' Continue with the remaining text on a new page:
            tl = rest
            page = doc.Pages.Add()
        End While
        '' Done:
    End Sub

    '' Creates a sample document with 100 pages of 'lorem ipsum'.
    '' The document is written to a newly created temporary file;
    '' the path of that file is returned.
    Private Function MakeDocumentToIndex() As String
        Const pageCount As Integer = 100
        Dim tempPath = Path.GetTempFileName()
        Using output As New FileStream(tempPath, FileMode.Open, FileAccess.ReadWrite)
            Dim pdf As New GcPdfDocument()
            '' See StartEndDoc for details on StartDoc/EndDoc mode:
            pdf.StartDoc(output)
            '' A single TextLayout holds/formats the text; it is sized to
            '' lay out the whole page including margins, and is reused for every page:
            Dim layout As New TextLayout(72) With {
                .FontCollection = _fc,
                .MaxHeight = pdf.PageSize.Height,
                .MaxWidth = pdf.PageSize.Width,
                .MarginAll = 72,
                .FirstLineIndent = 72 / 2
            }
            With layout.DefaultFormat
                .FontName = _fontFamily
                .FontSize = 12
            End With
            '' Generate the document, clearing the layout after each page is drawn:
            For pageNo = 1 To pageCount
                layout.Append(Util.LoremIpsum(1))
                layout.PerformLayout(True)
                pdf.NewPage().Graphics.DrawTextLayout(layout, PointF.Empty)
                layout.Clear()
            Next
            pdf.EndDoc()
        End Using
        Return tempPath
    End Function
End Class