业务背景

  • 想在生成的文档中插入公式,以丰富文档格式、提高专业度。在大模型输出的各种公式格式中,LaTeX 是最稳定的,但也不排除生成错误、导致 Word 无法正确渲染的情况。因此,本文需要解决以下两个问题:
    1. LaTeX 公式转换为 MathML,然后转换为 OMML,并写入 Word。目前,LaTeX → MathML 的方案较为成熟,因此本文重点探讨 MathML → OMML 的实现。
    2. 校验最终插入 Word 文档的 OMML 是否可以正常渲染。

方案实现

  • 经过我的一番搜索,在 Stack Overflow 上找到了一个解决方案:微软提供了 XSLT 样式表(MML2OMML.XSL),可用于将 MathML 转换为 OMML。
  • 整体公式处理链路如下
    1. 大模型生成LaTeX格式的公式,随后转为MathML
    2. 调用服务将MathML转为OMML,并写入Word文档
    3. 使用 Open XML SDK 校验Word文档格式是否有误。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
namespace OpenXmlValidatorService
{
    /// <summary>
    /// Process entry point: builds the host and runs it.
    /// </summary>
    public class Program
    {
        /// <summary>
        /// Builds the host from the command-line arguments and runs it.
        /// </summary>
        /// <param name="args">Command-line arguments forwarded to the host builder.</param>
        public static void Main(string[] args)
        {
            var host = CreateHostBuilder(args).Build();
            host.Run();
        }

        /// <summary>
        /// Creates the host builder: default configuration, <see cref="Startup"/> as the web
        /// startup class, and console logging with a custom timestamp format.
        /// </summary>
        /// <param name="args">Command-line arguments passed to the default builder.</param>
        /// <returns>The configured host builder.</returns>
        public static IHostBuilder CreateHostBuilder(string[] args)
        {
            var builder = Host.CreateDefaultBuilder(args);

            builder = builder.ConfigureWebHostDefaults(web => web.UseStartup<Startup>());

            return builder.ConfigureLogging(log =>
            {
                // Drop the default logging providers so that only the console provider below is used.
                log.ClearProviders();

                // Prefix every console log entry with a timestamp in this format.
                log.AddConsole(console => console.TimestampFormat = "yyyy-MM-dd HH:mm:ss ");
            });
        }
    }
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
namespace OpenXmlValidatorService
{
    /// <summary>
    /// Registers services and configures the HTTP request pipeline.
    /// </summary>
    public class Startup
    {
        /// <summary>
        /// Registers the controller services with the container.
        /// </summary>
        /// <param name="services">The service collection to add to.</param>
        public void ConfigureServices(IServiceCollection services) => services.AddControllers();

        /// <summary>
        /// Builds the request pipeline: developer exception page (development environment only),
        /// routing, then controller endpoints.
        /// </summary>
        /// <param name="app">The application builder the middleware is added to.</param>
        /// <param name="env">The hosting environment, used to detect development mode.</param>
        public void Configure(IApplicationBuilder app, IWebHostEnvironment env)
        {
            bool isDevelopment = env.IsDevelopment();
            if (isDevelopment)
            {
                app.UseDeveloperExceptionPage();
            }

            app.UseRouting();
            app.UseEndpoints(routes => routes.MapControllers());
        }
    }
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
using Microsoft.AspNetCore.Mvc;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Validation;
using DocumentFormat.OpenXml.Wordprocessing;
using System.Xml;
using System.Xml.Xsl;
using System.Text;
using System.IO;
using System.Xml.Linq;


/// <summary>
/// Request body of the <c>convertMathml</c> endpoint.
/// </summary>
public class ConvertRequest
{
    /// <summary>
    /// MathML formulas to convert; each one that converts successfully becomes its own paragraph.
    /// </summary>
    public List<string> MathmlList { get; set; }

    /// <summary>
    /// Path of the .docx file to create.
    /// </summary>
    public string OutputFilePath { get; set; }
}

namespace OpenXmlValidatorService.Controllers
{
[Route("api/[controller]")]
[ApiController]
public class ValidatorController : ControllerBase
{
private readonly ILogger<ValidatorController> _logger;
private readonly DateTime _startTime;

/// <summary>
/// Creates the controller, storing the injected logger and the time at which this
/// controller instance was created.
/// </summary>
/// <param name="logger">Logger used by the actions of this controller.</param>
public ValidatorController(ILogger<ValidatorController> logger)
{
    (_logger, _startTime) = (logger, DateTime.Now);
}

/// <summary>
/// Builds a .docx containing the given MathML formula and validates it with
/// <see cref="OpenXmlValidator"/>.
/// </summary>
/// <param name="mathml">MathML markup of one formula, read from the request body.</param>
/// <returns>
/// HTTP 200 with <c>code = 200</c> when no math-related validation error is reported for
/// <c>/word/document.xml</c>; HTTP 200 with <c>code = 400</c> and the generated file path in
/// <c>message</c> when one is; HTTP 500 with <c>code = 500</c> when generation or validation throws.
/// </returns>
[HttpPost("validate")]
public Task<IActionResult> ValidateDocumentFromMathML([FromBody] string mathml)
{
    // All the work is synchronous, so the result is wrapped in a completed task instead of
    // declaring the method async without any await.
    return Task.FromResult(ValidateMathml(mathml));
}

/// <summary>
/// Synchronous implementation of <see cref="ValidateDocumentFromMathML"/>.
/// </summary>
private IActionResult ValidateMathml(string mathml)
{
    _logger.LogInformation("接收到参数:{Mathml}", mathml);
    try
    {
        string outputDirectory = "/app/mathml_doc/";
        string fileName = Guid.NewGuid().ToString() + ".docx";
        string tempFilePath = Path.Combine(outputDirectory, fileName);
        ConvertToDocx(mathml, tempFilePath);

        _logger.LogInformation("文档生成成功,开始验证:{FilePath}", tempFilePath);

        // Validate the generated document with OpenXmlValidator.
        using (var doc = WordprocessingDocument.Open(tempFilePath, false))
        {
            var validator = new OpenXmlValidator();

            // Validate() returns a lazy sequence; materialize it once so the document is not
            // re-validated by each of the count check, the log call and the loop below.
            var errors = validator.Validate(doc).ToList();

            if (errors.Count > 0)
            {
                _logger.LogWarning("文档验证发现 {ErrorCount} 个错误", errors.Count);
                foreach (var error in errors)
                {
                    _logger.LogDebug("错误描述: {Description}, 节点路径: {Node}, 部件: {Part}",
                        error.Description, error.Node, error.Part?.Uri);

                    // Only errors in the main document part whose description mentions "math"
                    // (any casing) are treated as formula problems.
                    bool inMainDocument = error.Part != null
                        && error.Part.Uri.ToString() == "/word/document.xml";
                    if (inMainDocument
                        && error.Description.Contains("math", StringComparison.OrdinalIgnoreCase))
                    {
                        _logger.LogError("检测到公式渲染问题:{Description}", error.Description);
                        return Ok(new
                        {
                            code = 400,
                            message = tempFilePath
                        });
                    }
                }
            }

            _logger.LogInformation("文档验证通过:{FilePath}", tempFilePath);
            return Ok(new { code = 200, message = "文档符合 Open XML 规范" });
        }
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "验证时发生异常:{Message}", ex.Message);
        return StatusCode(StatusCodes.Status500InternalServerError, new
        {
            code = 500,
            message = $"验证时发生异常: {ex.Message}"
        });
    }
}


/// <summary>
/// Health-check endpoint: reports service name, status, current time, process uptime and version.
/// </summary>
/// <returns>HTTP 200 with the heartbeat payload.</returns>
[HttpGet("health")]
public IActionResult GetHeartbeat()
{
    var currentTime = DateTime.Now;

    // Uptime is measured from the start of the process. A timestamp stored on the controller
    // instance would be reset whenever a new controller is constructed (by default, once per
    // request), which makes the reported uptime meaningless.
    DateTime processStart;
    using (var process = System.Diagnostics.Process.GetCurrentProcess())
    {
        processStart = process.StartTime;
    }
    var uptime = currentTime - processStart;

    var result = new
    {
        serviceName = "快降重 Open XML 文档校验 服务",
        statusCode = 200,
        timestamp = currentTime.ToString("yyyy-MM-dd HH:mm:ss"),
        // TotalHours instead of Hours: the Hours component wraps back to 0 after each full day.
        uptime = $"{(int)uptime.TotalHours}h {uptime.Minutes}m",
        version = "1.0.0"
    };

    _logger.LogInformation("心跳正常:{Result}", result);
    return Ok(result);
}

/// <summary>
/// Converts a list of MathML formulas to OMML and writes them to a new .docx file.
/// </summary>
/// <param name="request">The MathML list and the output file path.</param>
/// <returns>
/// Always HTTP 200; the outcome is carried in the body as <c>code = 200</c> on success or
/// <c>code = 500</c> with an explanatory <c>message</c> on invalid input or failure.
/// </returns>
[HttpPost("convertMathml")]
public IActionResult ConvertToDocx([FromBody] ConvertRequest request)
{
    if (request == null || request.MathmlList == null || request.MathmlList.Count == 0)
    {
        return Ok(new { code = 500, message = "请求体中缺少 MathML 列表或数据为空。" });
    }

    if (string.IsNullOrWhiteSpace(request.OutputFilePath))
    {
        return Ok(new { code = 500, message = "请求体中缺少输出文件路径。" });
    }

    // NOTE(review): OutputFilePath comes straight from the request body, so the caller decides
    // where on the server the file is written. Confirm this endpoint is only reachable by
    // trusted callers, or restrict the path to a fixed output root.
    try
    {
        // Generate the document with the existing list-based converter.
        ConvertToDocxByList(request.MathmlList, request.OutputFilePath);

        // Guard against reporting success when the converter returned without producing a file.
        if (!System.IO.File.Exists(request.OutputFilePath))
        {
            _logger.LogError("文档生成失败:未生成输出文件 {FilePath}", request.OutputFilePath);
            return Ok(new { code = 500, message = "文档生成失败:未生成输出文件。" });
        }

        return Ok(new { code = 200, message = "文档生成成功" });
    }
    catch (Exception ex)
    {
        // Log the full exception; the response only carries the message text.
        _logger.LogError(ex, "文档生成失败:{Message}", ex.Message);
        return Ok(new { code = 500, message = $"文档生成失败:{ex.Message}" });
    }
}

/// <summary>
/// Converts one MathML formula to OMML with the <c>MML2OMML.XSL</c> stylesheet and writes it
/// as the only paragraph of a new Word document.
/// </summary>
/// <param name="mathml">MathML markup of the formula.</param>
/// <param name="outputFilePath">Path of the .docx file to create.</param>
/// <exception cref="ArgumentNullException"><paramref name="mathml"/> is null.</exception>
/// <exception cref="FileNotFoundException">The stylesheet is not found at <c>./MML2OMML.XSL</c>.</exception>
/// <exception cref="InvalidOperationException">The transformed markup contains no OfficeMath element.</exception>
public static void ConvertToDocx(string mathml, string outputFilePath)
{
    if (mathml == null)
    {
        throw new ArgumentNullException(nameof(mathml));
    }

    // XSLT stylesheet that converts MathML to OfficeMath; resolved against the current directory.
    string xsltPath = "./MML2OMML.XSL";
    Console.WriteLine("xsltPath: " + xsltPath);
    Console.WriteLine("currentPath: " + Directory.GetCurrentDirectory());

    if (!System.IO.File.Exists(xsltPath))
    {
        // Fail loudly: returning silently would leave callers believing a document was written.
        string notFoundMessage = "XSLT 文件未找到,请确保文件路径正确!" + Directory.GetCurrentDirectory();
        Console.WriteLine(notFoundMessage);
        throw new FileNotFoundException(notFoundMessage, xsltPath);
    }

    var xslTransform = new XslCompiledTransform();
    xslTransform.Load(xsltPath);

    var settings = new XmlWriterSettings
    {
        ConformanceLevel = ConformanceLevel.Fragment,
        OmitXmlDeclaration = true
    };

    // Run the transformation into memory and read the result back as a string.
    string officeML;
    using (XmlReader reader = XmlReader.Create(new StringReader(mathml)))
    using (MemoryStream ms = new MemoryStream())
    using (XmlWriter xw = XmlWriter.Create(ms, settings))
    {
        xslTransform.Transform(reader, xw);
        xw.Flush();
        ms.Seek(0, SeekOrigin.Begin);

        using (StreamReader sr = new StreamReader(ms, Encoding.UTF8))
        {
            officeML = sr.ReadToEnd();
        }
    }

    Console.WriteLine("OriginInnerXML: " + officeML);

    // Give empty <m:e /> elements a run containing a single space.
    officeML = officeML.Replace("<m:e />", "<m:e><m:r><w:rPr><w:rFonts w:ascii='Cambria Math' w:hAnsi='Cambria Math' /></w:rPr><m:t xml:space='preserve'> </m:t></m:r></m:e>");

    // Parse the markup through a scratch paragraph to obtain a typed OfficeMath element.
    var scratchParagraph = new DocumentFormat.OpenXml.Wordprocessing.Paragraph();
    scratchParagraph.InnerXml = officeML;
    Console.WriteLine("InnerXML: " + officeML);

    var parsedMath = scratchParagraph.GetFirstChild<DocumentFormat.OpenXml.Math.OfficeMath>();
    if (parsedMath == null)
    {
        // Checked before the file is created so a failed conversion does not leave an empty .docx.
        throw new InvalidOperationException("OfficeMath 转换失败:转换结果中未找到 OfficeMath 元素。");
    }
    var om = (DocumentFormat.OpenXml.Math.OfficeMath)parsedMath.Clone();

    // Create the target directory if needed, as ConvertToDocxByList does.
    string directoryPath = Path.GetDirectoryName(outputFilePath);
    if (!string.IsNullOrEmpty(directoryPath))
    {
        Directory.CreateDirectory(directoryPath);
    }

    // Create the Word document and insert the formula as a paragraph of the body.
    using (WordprocessingDocument wordDoc = WordprocessingDocument.Create(outputFilePath, DocumentFormat.OpenXml.WordprocessingDocumentType.Document))
    {
        var mainPart = wordDoc.AddMainDocumentPart();
        mainPart.Document = new Document(new Body());

        var formulaParagraph = new DocumentFormat.OpenXml.Wordprocessing.Paragraph();
        formulaParagraph.Append(om);

        mainPart.Document.Body.Append(formulaParagraph);
        mainPart.Document.Save();
    }

    Console.WriteLine("Word 文档生成成功!");
}

/// <summary>
/// Converts each MathML formula in the list to OMML with the <c>MML2OMML.XSL</c> stylesheet
/// and writes every successfully converted formula as its own paragraph of a new Word document.
/// </summary>
/// <param name="mathmlList">MathML markup of the formulas, in output order.</param>
/// <param name="outputFilePath">Path of the .docx file to create; its directory is created if missing.</param>
/// <exception cref="ArgumentNullException"><paramref name="mathmlList"/> is null.</exception>
/// <exception cref="FileNotFoundException">The stylesheet is not found at <c>./MML2OMML.XSL</c>.</exception>
public static void ConvertToDocxByList(List<string> mathmlList, string outputFilePath)
{
    if (mathmlList == null)
    {
        throw new ArgumentNullException(nameof(mathmlList));
    }

    // GetDirectoryName yields null/empty for a bare file name; there is nothing to create then.
    string directoryPath = Path.GetDirectoryName(outputFilePath);
    if (!string.IsNullOrEmpty(directoryPath) && !Directory.Exists(directoryPath))
    {
        Console.WriteLine($"目录不存在,正在创建目录:{directoryPath}");
        Directory.CreateDirectory(directoryPath);
    }

    // XSLT stylesheet that converts MathML to OfficeMath; resolved against the current directory.
    string xsltPath = "./MML2OMML.XSL";
    Console.WriteLine("xsltPath: " + xsltPath);
    Console.WriteLine("currentPath: " + Directory.GetCurrentDirectory());

    if (!System.IO.File.Exists(xsltPath))
    {
        // Fail loudly: returning silently would let callers report success without any output file.
        string notFoundMessage = "XSLT 文件未找到,请确保文件路径正确!" + Directory.GetCurrentDirectory();
        Console.WriteLine(notFoundMessage);
        throw new FileNotFoundException(notFoundMessage, xsltPath);
    }

    // Load the stylesheet once and reuse it for every formula.
    var xslTransform = new XslCompiledTransform();
    xslTransform.Load(xsltPath);

    // The writer settings do not depend on the formula, so they are built once outside the loop.
    var settings = new XmlWriterSettings
    {
        ConformanceLevel = ConformanceLevel.Fragment,
        OmitXmlDeclaration = true
    };

    using (WordprocessingDocument wordDoc = WordprocessingDocument.Create(outputFilePath, DocumentFormat.OpenXml.WordprocessingDocumentType.Document))
    {
        var mainPart = wordDoc.AddMainDocumentPart();
        mainPart.Document = new Document(new Body());

        foreach (string mathml in mathmlList)
        {
            // Transform this formula into memory and read the OMML back as a string.
            string officeML;
            using (XmlReader reader = XmlReader.Create(new StringReader(mathml)))
            using (MemoryStream ms = new MemoryStream())
            using (XmlWriter xw = XmlWriter.Create(ms, settings))
            {
                xslTransform.Transform(reader, xw);
                xw.Flush();
                ms.Seek(0, SeekOrigin.Begin);

                using (StreamReader sr = new StreamReader(ms, Encoding.UTF8))
                {
                    officeML = sr.ReadToEnd();
                }
            }

            Console.WriteLine("OriginInnerXML: " + officeML);

            // Add the font settings, then give empty <m:e /> elements a run containing a space.
            officeML = AddFontStyle(officeML);
            officeML = officeML.Replace("<m:e />", "<m:e><m:r><w:rPr><w:rFonts w:ascii='Cambria Math' w:hAnsi='Cambria Math' /></w:rPr><m:t xml:space='preserve'> </m:t></m:r></m:e>");

            // Parse the markup through a scratch paragraph to obtain a typed OfficeMath element.
            var scratchParagraph = new DocumentFormat.OpenXml.Wordprocessing.Paragraph();
            scratchParagraph.InnerXml = officeML;
            Console.WriteLine("InnerXML: " + officeML);

            var om = scratchParagraph.GetFirstChild<DocumentFormat.OpenXml.Math.OfficeMath>();
            if (om == null)
            {
                Console.WriteLine("OfficeMath 转换失败,跳过该公式。");
                continue;
            }

            // The element still belongs to the scratch paragraph, so append a deep clone.
            var formulaParagraph = new DocumentFormat.OpenXml.Wordprocessing.Paragraph();
            formulaParagraph.Append(om.CloneNode(true));
            mainPart.Document.Body.Append(formulaParagraph);
        }

        // Save the document.
        mainPart.Document.Save();
    }

    Console.WriteLine("Word 文档生成成功!");
}

/// <summary>
/// Replaces every childless <c>m:rPr</c> element in the given OMML with an <c>m:rPr</c> that
/// contains a <c>w:rFonts</c> element selecting "Cambria Math" for the ascii and hAnsi fonts.
/// </summary>
/// <param name="inputOfficeML">OMML markup with a single root element.</param>
/// <returns>The modified markup, serialized by <see cref="XDocument"/>.</returns>
public static string AddFontStyle(string inputOfficeML)
{
    Console.WriteLine("接收到的inputOfficeML: " + inputOfficeML);
    XDocument xmlDoc = XDocument.Parse(inputOfficeML);
    XNamespace m = "http://schemas.openxmlformats.org/officeDocument/2006/math";
    XNamespace w = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";

    // Declare the "w" prefix on the root so the inserted elements serialize as w:rFonts.
    // Skip it when the root already carries an xmlns:w attribute: adding a second attribute
    // with the same name makes XElement.Add throw InvalidOperationException.
    XElement root = xmlDoc.Root;
    XName wPrefixDeclaration = XNamespace.Xmlns + "w";
    if (root != null && root.Name.Namespace != w && root.Attribute(wPrefixDeclaration) == null)
    {
        root.Add(new XAttribute(wPrefixDeclaration, w.NamespaceName));
    }

    // Snapshot the matches first: the tree is modified while they are being replaced.
    var emptyRunProperties = xmlDoc.Descendants(m + "rPr")
        .Where(rPr => !rPr.HasElements)
        .ToList();

    foreach (var rPr in emptyRunProperties)
    {
        rPr.ReplaceWith(new XElement(m + "rPr",
            new XElement(w + "rFonts",
                new XAttribute(w + "ascii", "Cambria Math"),
                new XAttribute(w + "hAnsi", "Cambria Math"))));
    }

    return xmlDoc.ToString();
}
}
}

公式生成链路

  • 该方案在Mac版WPS会存在兼容性问题,因WPS默认字体设置错误导致公式渲染异常,但Word底层XML结构中的公式节点字体设置是正确的。
    • 解决方案:在Windows系统上安装Office Word,使用命令行方式将文件另存一份后,再用Mac版WPS打开,公式即可正常显示。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Opens a Word document through COM automation and saves a copy of it under a new path.
#   -inputPath   path of the existing document to open
#   -outputPath  path the copy is saved to
param (
    [string]$inputPath,
    [string]$outputPath
)

# Turn every error into a terminating one so that a failed Open/SaveAs stops the script
# and yields a non-zero exit code instead of carrying on with a null document.
$ErrorActionPreference = 'Stop'

# Hand Word absolute paths: relative paths would be resolved by the Word process,
# not against the directory this script was started from.
[string]$inputFullPath = (Resolve-Path -LiteralPath $inputPath).ProviderPath
[string]$outputFullPath = $ExecutionContext.SessionState.Path.GetUnresolvedProviderPathFromPSPath($outputPath)

$word = $null
$doc = $null
try {
    # Create the Word application object; suppress dialogs, nobody can answer them here.
    $word = New-Object -ComObject Word.Application
    $word.DisplayAlerts = 0  # wdAlertsNone

    # Open the existing Word document.
    $doc = $word.Documents.Open($inputFullPath)

    # Save it as a new file.
    $doc.SaveAs([ref]$outputFullPath)
}
finally {
    # Always close the document and quit Word, even on failure, so that no
    # WINWORD.EXE process is left running in the background.
    try {
        if ($null -ne $doc) {
            $doc.Close()
            [void][System.Runtime.InteropServices.Marshal]::ReleaseComObject($doc)
        }
    }
    finally {
        if ($null -ne $word) {
            $word.Quit()
            [void][System.Runtime.InteropServices.Marshal]::ReleaseComObject($word)
        }
    }
}
  • 简单封装一个接口,接收上传的文件并调用上述 PowerShell 脚本:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from flask import Flask, request, send_file
import os
import subprocess

app = Flask(__name__)

# Directory for uploaded files and directory for the processed files.
UPLOAD_FOLDER = "uploads"
OUTPUT_FOLDER = "outputs"
for _folder in (UPLOAD_FOLDER, OUTPUT_FOLDER):
    # Create the directory up front; an already existing directory is not an error.
    os.makedirs(_folder, exist_ok=True)

@app.route('/convert', methods=['POST'])
def convert():
    """Re-save an uploaded document through the save_as.ps1 script and return the result.

    Expects a multipart upload under the form field ``file``. The upload is stored in
    UPLOAD_FOLDER, the script writes ``converted_<name>`` into OUTPUT_FOLDER, and that
    file is sent back as an attachment.

    Returns a JSON error with status 400 when no usable file was uploaded, and with
    status 500 when the script fails or produces no output file.
    """
    # Receive the uploaded file.
    if 'file' not in request.files:
        return {"error": "No file uploaded"}, 400

    uploaded_file = request.files['file']
    original_name = uploaded_file.filename or ''
    if original_name == '':
        return {"error": "No file selected"}, 400

    # The filename is chosen by the client: keep only its last path component (for both
    # separator styles) so that a name such as "..\\..\\x.docx" cannot escape the folders.
    safe_name = os.path.basename(original_name.replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        return {"error": "Invalid file name"}, 400

    # Use absolute paths so the script receives the same locations this process writes to
    # and reads from, independent of how any other component resolves relative paths.
    input_path = os.path.abspath(os.path.join(UPLOAD_FOLDER, safe_name))
    uploaded_file.save(input_path)

    # Build the output path.
    output_path = os.path.abspath(os.path.join(OUTPUT_FOLDER, f"converted_{safe_name}"))

    # Run the PowerShell script that does the actual re-save.
    try:
        subprocess.run([
            "powershell", "-ExecutionPolicy", "Bypass", "-File", "save_as.ps1",
            "-inputPath", input_path,
            "-outputPath", output_path
        ], check=True)
    except subprocess.CalledProcessError as e:
        return {"error": f"PowerShell script failed: {str(e)}"}, 500

    # A zero exit code does not prove that the file was written; check before sending it.
    if not os.path.isfile(output_path):
        return {"error": "PowerShell script produced no output file"}, 500

    # Return the processed file.
    return send_file(output_path, as_attachment=True)

if __name__ == "__main__":
    # Debug mode enables the interactive debugger, which must not be exposed on a service
    # that accepts uploads from other machines. It is therefore opt-in: set FLASK_DEBUG=1
    # in the environment to get the previous behaviour during local development.
    app.run(debug=os.environ.get("FLASK_DEBUG") == "1")

  • 生成公式的方案就是上述内容,但是更繁琐的是将生成的公式插入到docx文档中的正确位置,以及行内公式的展示,本文就不介绍了。