jeudi 1 janvier 2015

Merging runs in an openxml file/document

The goal is to reduce the number of runs in the paragraphs of a document. I chose not to use the Open XML SDK because my end goal is a templating engine: in my process the runs may contain XSL, that is, XML. So I decided to stay with raw XML tools.

The working code (usings omitted, except those that reveal a dependency) looks like this:

namespace SandBox {
    /// <summary>
    /// Sample console entry point: loads a .docx, merges the runs of every
    /// paragraph and writes the result into a copy of the source file.
    /// </summary>
    class Program {
        /// <summary>
        /// Runs the sample. Any exception is caught and its message is printed to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args) {
            try {
                String fileName = @"somepath\somefile.docx";
                String destFile = "res.docx";

                // Read the document part and wrap it in a navigator plus its namespace manager.
                String documentXml = ZDocx.GetDocxDocumentStringFromDocxFile(fileName);
                Tuple<XPathNavigator, XmlNamespaceManager> loaded =
                    ZDocx.GetNavigatorAndManagerFromString(documentXml);
                XPathNavigator document = loaded.Item1;
                XmlNamespaceManager namespaces = loaded.Item2;

                XPathNodeIterator paragraphs = document.Select("//w:p", namespaces);
                while (paragraphs.MoveNext()) {
                    // Only bold is used as a grouping condition here. To merge all runs
                    // regardless of their styles, call ZDocx.MergeRuns(paragraphs.Current).
                    ByStylesNodesComparatorSettings settings = new ByStylesNodesComparatorSettings();
                    settings.CheckBold = true;

                    ByStylesNodesComparator byBold = new ByStylesNodesComparator();
                    byBold.Settings = settings;

                    ZDocx.MergeRuns(paragraphs.Current, byBold);
                }

                // The destination starts as a copy of the source; the document part is then replaced in it.
                System.IO.File.Copy(fileName, destFile, true);
                ZDocx.SetDocxDocumentStringToDocxFile(document, destFile);
            } catch (Exception ex) {
                Console.WriteLine(ex.Message);
            }
        }
    }
}

There are 3 main steps:

  • First, I load the document.xml part of the Open XML file into an XPathNavigator.
  • Then I process each paragraph.
  • Finally, I inject the modified document.xml into a new Open XML file.

First, the plumbing code:

using Ionic.Zip;

namespace SandBox {
    public class ZDocx {
        public static String nsW = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";

        /// <summary>
        /// Reads the <c>word\document.xml</c> entry of a .docx package and returns its content as a string.
        /// </summary>
        /// <param name="fileName">Path of the .docx (zip) file to read.</param>
        /// <returns>The text of the document part.</returns>
        /// <exception cref="System.IO.InvalidDataException">
        /// The archive contains no <c>word\document.xml</c> entry.
        /// </exception>
        public static String GetDocxDocumentStringFromDocxFile(String fileName) {
            using (ZipFile zip = ZipFile.Read(fileName))
            using (MemoryStream buffer = new MemoryStream()) {
                ZipEntry entry = zip["word\\document.xml"];
                if (entry == null)
                    throw new System.IO.InvalidDataException(
                        "GetDocxDocumentStringFromDocxFile: entry 'word\\document.xml' not found in '" + fileName + "'.");

                // Extract in memory: there is no temporary file left behind when
                // extraction fails, and no close error has to be swallowed.
                entry.Extract(buffer);
                buffer.Position = 0;

                // Same decoding as reading the extracted file through a StreamReader:
                // a byte-order mark is honoured, UTF-8 is assumed otherwise.
                using (StreamReader reader = new StreamReader(buffer)) {
                    return reader.ReadToEnd();
                }
            }
        }

        /// <summary>
        /// Serializes the node under <paramref name="xdoc"/> as UTF-8 XML, stores it as the
        /// <c>word\document.xml</c> entry of the given archive and saves the archive.
        /// </summary>
        /// <param name="xdoc">Navigator positioned on the node to write (default attributes are not copied).</param>
        /// <param name="fileName">Path of the existing .docx (zip) file to update in place.</param>
        public static void SetDocxDocumentStringToDocxFile(XPathNavigator xdoc, String fileName) {
            XmlWriterSettings writerSettings = new XmlWriterSettings();
            writerSettings.Encoding = Encoding.UTF8;

            using (ZipFile archive = ZipFile.Read(fileName))
            using (MemoryStream buffer = new MemoryStream())
            using (XmlWriter writer = XmlWriter.Create(buffer, writerSettings)) {
                writer.WriteNode(xdoc, false);
                writer.Flush();

                // Rewind so the entry content starts at the first serialized byte.
                buffer.Position = 0;
                archive.UpdateEntry("word\\document.xml", buffer);

                // Save is called while the buffer is still open.
                archive.Save();
            }
        }

        /// <summary>
        /// Parses an XML string into an <see cref="XmlDocument"/> and returns a navigator on it
        /// together with a namespace manager that maps the <c>w</c> and <c>xsl</c> prefixes.
        /// </summary>
        /// <param name="st">The XML text to load.</param>
        /// <returns>
        /// Item1: a navigator created from the loaded document.
        /// Item2: a namespace manager built on the navigator's name table, with <c>w</c> bound to
        /// <see cref="nsW"/> and <c>xsl</c> bound to the XSL Transform namespace.
        /// </returns>
        public static Tuple<XPathNavigator, XmlNamespaceManager> GetNavigatorAndManagerFromString(String st) {
            XmlDocument xml = new XmlDocument();
            XmlReaderSettings readerSettings = new XmlReaderSettings() {
                IgnoreWhitespace = false,
                CloseInput = true
            };

            // The using block closes the reader (and, through CloseInput, the
            // underlying StringReader) even when Load throws on malformed XML.
            using (XmlReader xReader = XmlReader.Create(new StringReader(st), readerSettings)) {
                xml.Load(xReader);
            }

            XPathNavigator navXml = xml.CreateNavigator();
            XmlNamespaceManager manager = new XmlNamespaceManager(navXml.NameTable);
            manager.AddNamespace("w", nsW);
            manager.AddNamespace("xsl", "http://www.w3.org/1999/XSL/Transform");

            return new Tuple<XPathNavigator, XmlNamespaceManager>(navXml, manager);
        }

    }
}

Then the merging code (MergeRuns is another member of the ZDocx class shown above):

namespace SandBox {
    public class ZDocx {
        // Namespace bound to the reserved "xml" prefix (xml:space).
        private const String XmlNamespace = "http://www.w3.org/XML/1998/namespace";

        /// <summary>
        /// Merges the text runs (<c>w:r</c>) that are direct children of a paragraph.
        /// </summary>
        /// <remarks>
        /// The first run met becomes the destination. Each following run accepted by
        /// <paramref name="areMergeable"/> has the content of its <c>w:t</c> appended to the
        /// destination's <c>w:t</c> and is then deleted; a rejected run becomes the new destination.
        /// Children of the paragraph that are not runs are skipped and do not reset the destination.
        /// Only runs made of an optional <c>w:rPr</c> and exactly one <c>w:t</c> take part in a merge:
        /// a run carrying anything else (break, tab, drawing, field...) is left untouched so that its
        /// content is neither lost nor reordered, and it becomes the new destination candidate.
        /// </remarks>
        /// <param name="paragraph">Navigator positioned on a <c>w:p</c> element. It is not moved.</param>
        /// <param name="areMergeable">
        /// Decides whether a run may be merged into the destination run.
        /// When null, every pair of runs is considered mergeable.
        /// </param>
        /// <exception cref="ArgumentNullException"><paramref name="paragraph"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="paragraph"/> is not positioned on a 'p' element.</exception>
        public static void MergeRuns(XPathNavigator paragraph, INodeComparator areMergeable = null) {
            if (paragraph == null)
                throw new ArgumentNullException("paragraph");
            if (paragraph.LocalName != "p")
                throw new ArgumentException("MergeRuns: paragraph is not a 'w:p'.");
            if (areMergeable == null)
                areMergeable = AlwaysTrueNodesComparator.GetInstance();

            XPathNavigator current = paragraph.Clone();
            // An empty paragraph has nothing to merge; without this guard the loop
            // below would start walking the siblings of the paragraph itself.
            if (!current.MoveToFirstChild())
                return;

            XPathNavigator destRun = null;
            XPathNavigator destText = null;
            do {
                if (current.LocalName != "r")
                    continue;

                XPathNavigator currentText = FindSingleText(current);
                Boolean canMerge = destText != null
                    && currentText != null
                    && areMergeable.AreMergeable(destRun, current);

                if (!canMerge) {
                    // This run starts a new group (destText stays null when the
                    // run is not a plain text run, which blocks merging into it).
                    destRun = current.Clone();
                    destText = currentText;
                    continue;
                }

                destText.InnerXml += currentText.InnerXml;
                // Keep leading/trailing blanks brought in by the merged text significant.
                if (currentText.GetAttribute("space", XmlNamespace) == "preserve")
                    EnsureSpacePreserved(destText);

                // Step back before deleting so that the MoveToNext of the loop
                // lands on the node that followed the deleted run.
                XPathNavigator previous = current.Clone();
                previous.MoveToPrevious();
                current.DeleteSelf();
                current = previous;
            } while (current.MoveToNext());
        }

        // Returns a navigator on the w:t of a run made only of an optional w:rPr
        // and exactly one w:t; returns null for any other run.
        private static XPathNavigator FindSingleText(XPathNavigator run) {
            XPathNavigator child = run.Clone();
            if (!child.MoveToFirstChild())
                return null;

            XPathNavigator text = null;
            do {
                if (child.NodeType != XPathNodeType.Element)
                    continue;
                if (child.NamespaceURI == nsW && child.LocalName == "rPr")
                    continue;
                if (child.NamespaceURI == nsW && child.LocalName == "t" && text == null) {
                    text = child.Clone();
                    continue;
                }
                return null;
            } while (child.MoveToNext());

            return text;
        }

        // Sets xml:space="preserve" on the given w:t element, creating the attribute when missing.
        private static void EnsureSpacePreserved(XPathNavigator text) {
            XPathNavigator attribute = text.Clone();
            if (attribute.MoveToAttribute("space", XmlNamespace))
                attribute.SetValue("preserve");
            else
                text.CreateAttribute("xml", "space", XmlNamespace, "preserve");
        }
    }
}
In the previous code, one key element is the areMergeable parameter: it decides how the merging occurs. The object passed for this parameter implements the following interface.
    /// <summary>
    /// Strategy used by ZDocx.MergeRuns to decide whether two runs may be merged.
    /// </summary>
    public interface INodeComparator {
        /// <summary>Tells whether the two given nodes may be merged into one.</summary>
        /// <param name="xpn1">First node; MergeRuns passes the run that receives the merged text.</param>
        /// <param name="xpn2">Second node; MergeRuns passes the run that is a candidate for merging.</param>
        /// <returns>true when the two nodes may be merged, false otherwise.</returns>
        Boolean AreMergeable(XPathNavigator xpn1, XPathNavigator xpn2);
    }
This interface may be implemented as in the following samples, provided as a starting point:
namespace SandBox {
    /// <summary>
    /// Comparator that accepts every pair of nodes. Exposed as a single shared
    /// instance obtained through <see cref="GetInstance"/>.
    /// </summary>
    public sealed class AlwaysTrueNodesComparator : INodeComparator {
        // readonly: the shared instance can never be swapped after type initialization.
        private static readonly AlwaysTrueNodesComparator _inst = new AlwaysTrueNodesComparator();

        // Private: instances are only obtained through GetInstance().
        private AlwaysTrueNodesComparator() {}

        /// <summary>Returns the shared instance.</summary>
        public static AlwaysTrueNodesComparator GetInstance() {
            return _inst;
        }

        /// <summary>Always reports the two nodes as mergeable; the arguments are not inspected.</summary>
        /// <param name="xpn1">Ignored.</param>
        /// <param name="xpn2">Ignored.</param>
        /// <returns>Always true.</returns>
        public Boolean AreMergeable(XPathNavigator xpn1, XPathNavigator xpn2) {
            return true;
        }
    }

    /// <summary>
    /// Selects which style properties ByStylesNodesComparator compares.
    /// Every flag is false until set.
    /// </summary>
    public class ByStylesNodesComparatorSettings {
        /// <summary>When true, the bold property (w:b) is compared.</summary>
        public bool CheckBold { get; set; }

        /// <summary>When true, the underline property (w:u) is compared.</summary>
        public bool CheckUnderlined { get; set; }

        /// <summary>When true, the italic property (w:i) is compared.</summary>
        public bool CheckItalic { get; set; }

        /// <summary>When true, the strike-through property (w:strike) is compared.</summary>
        public bool CheckStriked { get; set; }
    }

    /// <summary>
    /// Comparator that accepts two runs when they agree on the selected style
    /// properties of their <c>w:rPr</c>: bold, underline, italic and strike-through.
    /// </summary>
    /// <remarks>
    /// Each property is compared as one of three states: absent, on, or explicitly
    /// switched off (<c>w:val</c> of "0", "false", "off" or "none"). Comparing mere
    /// presence would treat <c>&lt;w:b/&gt;</c> and <c>&lt;w:b w:val="0"/&gt;</c> as the
    /// same style. An absent property and a switched-off one are kept distinct because
    /// an absent property may be supplied by a style this class does not resolve.
    /// The kind of underline (any <c>w:val</c> other than "none") is not distinguished.
    /// </remarks>
    public class ByStylesNodesComparator : INodeComparator {
        /// <summary>
        /// Properties to compare. When null, all four properties are compared.
        /// </summary>
        public ByStylesNodesComparatorSettings Settings { get; set; }

        /// <summary>Tells whether two runs agree on every selected style property.</summary>
        /// <param name="xpn1">Navigator positioned on the first run. It is not moved.</param>
        /// <param name="xpn2">Navigator positioned on the second run. It is not moved.</param>
        /// <returns>true when no selected property differs between the two runs.</returns>
        /// <exception cref="ArgumentNullException">One of the navigators is null.</exception>
        public Boolean AreMergeable(XPathNavigator xpn1, XPathNavigator xpn2) {
            if (xpn1 == null)
                throw new ArgumentNullException("xpn1");
            if (xpn2 == null)
                throw new ArgumentNullException("xpn2");

            XPathNavigator rPr1 = FindRunProperties(xpn1);
            XPathNavigator rPr2 = FindRunProperties(xpn2);

            if ((Settings == null || Settings.CheckBold) && !SameState(rPr1, rPr2, "b"))
                return false;
            if ((Settings == null || Settings.CheckUnderlined) && !SameState(rPr1, rPr2, "u"))
                return false;
            if ((Settings == null || Settings.CheckItalic) && !SameState(rPr1, rPr2, "i"))
                return false;
            if ((Settings == null || Settings.CheckStriked) && !SameState(rPr1, rPr2, "strike"))
                return false;

            return true;
        }

        // Returns a navigator on the w:rPr child of the run, or null when the run has none.
        // Works on a clone so the caller's navigator is never moved.
        private static XPathNavigator FindRunProperties(XPathNavigator run) {
            XPathNavigator nav = run.Clone();
            return nav.MoveToChild("rPr", ZDocx.nsW) ? nav : null;
        }

        // True when the named property is in the same state under both w:rPr elements.
        private static Boolean SameState(XPathNavigator rPr1, XPathNavigator rPr2, String localName) {
            return GetState(rPr1, localName) == GetState(rPr2, localName);
        }

        // State of a property under a w:rPr: null when absent (or no w:rPr),
        // "off" when its w:val switches it off, "on" otherwise.
        private static String GetState(XPathNavigator rPr, String localName) {
            if (rPr == null)
                return null;

            XPathNavigator property = rPr.Clone();
            if (!property.MoveToChild(localName, ZDocx.nsW))
                return null;

            switch (property.GetAttribute("val", ZDocx.nsW)) {
                case "0":
                case "false":
                case "off":
                case "none":
                    return "off";
                default:
                    return "on";
            }
        }
    }
}

Take care to clone the navigators in the AreMergeable method so as not to surprise the caller.

Using AlwaysTrueNodesComparator reduces each paragraph to a single run, which keeps the styles (if any) of the first run of that paragraph.

Using ByStylesNodesComparator merges runs according to part of their styles. In the implemented class, the handled styles are Bold, Underline, Strike and Italic. Note that only basic underlining is handled: the underline style is not taken into account.

Aucun commentaire:

Enregistrer un commentaire