FileProcessingService.cs 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386
  1. 
  2. using Aspose.Words;
  3. using Aspose.Words.Tables;
  4. using Aspose.Words.Fields;
  5. using System.Text;
  6. namespace OASystem.API.OAMethodLib.FileProcessing
  7. {
  8. /// <summary>
  9. /// 文件处理服务,使用Aspose.Words处理Word文档
  10. /// </summary>
  11. public class FileProcessingService : IFileProcessingService
  12. {
  13. private readonly ILogger<FileProcessingService> _logger;
  14. private readonly IConfiguration _configuration;
  15. public FileProcessingService(
  16. ILogger<FileProcessingService> logger,
  17. IConfiguration configuration)
  18. {
  19. _logger = logger;
  20. _configuration = configuration;
  21. InitializeAsposeLicense();
  22. }
  23. /// <summary>
  24. /// 初始化Aspose.Words许可证
  25. /// </summary>
  26. private void InitializeAsposeLicense()
  27. {
  28. try
  29. {
  30. var licensePath = _configuration["Aspose:LicensePath"];
  31. if (!string.IsNullOrEmpty(licensePath) && System.IO.File.Exists(licensePath))
  32. {
  33. var license = new License();
  34. license.SetLicense(licensePath);
  35. _logger.LogInformation("Aspose.Words 15.12.0 许可证已成功加载");
  36. }
  37. else
  38. {
  39. _logger.LogWarning("Aspose.Words许可证文件未找到,将使用评估模式");
  40. }
  41. }
  42. catch (Exception ex)
  43. {
  44. _logger.LogError(ex, "Aspose.Words许可证初始化失败");
  45. }
  46. }
  47. /// <summary>
  48. /// 处理Word文档并提取结构化信息
  49. /// </summary>
  50. public async Task<ProcessingResult> ProcessWordDocumentAsync(IFormFile file)
  51. {
  52. var stopwatch = System.Diagnostics.Stopwatch.StartNew();
  53. try
  54. {
  55. _logger.LogInformation("开始处理Word文档: {FileName} ({Size} bytes)",
  56. file.FileName, file.Length);
  57. if (!IsSupportedWordFormat(file.FileName))
  58. {
  59. return new ProcessingResult
  60. {
  61. Success = false,
  62. ErrorMessage = $"不支持的文件格式: {Path.GetExtension(file.FileName)}",
  63. FileSize = file.Length,
  64. FileType = file.ContentType
  65. };
  66. }
  67. using var stream = new MemoryStream();
  68. await file.CopyToAsync(stream);
  69. stream.Position = 0;
  70. var documentInfo = await ExtractDocumentInfoAsync(stream, file.FileName);
  71. stopwatch.Stop();
  72. _logger.LogInformation("Word文档处理完成: {FileName}, 耗时: {ElapsedMs}ms",
  73. file.FileName, stopwatch.ElapsedMilliseconds);
  74. return new ProcessingResult
  75. {
  76. Success = true,
  77. Data = documentInfo,
  78. FileSize = file.Length,
  79. FileType = file.ContentType
  80. };
  81. }
  82. catch (Exception ex)
  83. {
  84. stopwatch.Stop();
  85. _logger.LogError(ex, "Word文档处理失败: {FileName}", file.FileName);
  86. return new ProcessingResult
  87. {
  88. Success = false,
  89. ErrorMessage = $"处理失败: {ex.Message}",
  90. FileSize = file.Length,
  91. FileType = file.ContentType
  92. };
  93. }
  94. }
  95. /// <summary>
  96. /// 批量处理Word文档
  97. /// </summary>
  98. public async Task<List<ProcessingResult>> ProcessWordDocumentsAsync(List<IFormFile> files)
  99. {
  100. var results = new List<ProcessingResult>();
  101. var tasks = files.Select(ProcessWordDocumentAsync).ToList();
  102. var batchResults = await Task.WhenAll(tasks);
  103. results.AddRange(batchResults);
  104. return results;
  105. }
  106. /// <summary>
  107. /// 从流中提取文档信息
  108. /// </summary>
  109. public async Task<WordDocumentInfo> ExtractDocumentInfoAsync(Stream stream, string fileName)
  110. {
  111. var stopwatch = System.Diagnostics.Stopwatch.StartNew();
  112. try
  113. {
  114. // 加载Word文档
  115. var doc = new Document(stream);
  116. var documentInfo = new WordDocumentInfo
  117. {
  118. Title = ExtractDocumentTitle(doc),
  119. Content = ExtractTextContent(doc),
  120. Metadata = ExtractMetadata(doc),
  121. Tables = ExtractTables(doc),
  122. FormFields = ExtractFormFields(doc),
  123. Sections = ExtractSections(doc),
  124. ImagesCount = CountImages(doc)
  125. };
  126. stopwatch.Stop();
  127. documentInfo.ProcessingTimeMs = stopwatch.ElapsedMilliseconds;
  128. return documentInfo;
  129. }
  130. catch (Exception ex)
  131. {
  132. _logger.LogError(ex, "文档信息提取失败: {FileName}", fileName);
  133. throw;
  134. }
  135. }
  136. /// <summary>
  137. /// 提取文档标题
  138. /// </summary>
  139. private string ExtractDocumentTitle(Document doc)
  140. {
  141. try
  142. {
  143. // 首先尝试从文档属性获取标题
  144. var title = doc.BuiltInDocumentProperties.Title;
  145. if (!string.IsNullOrEmpty(title))
  146. return title;
  147. // 如果没有标题,尝试从第一个段落提取
  148. foreach (Aspose.Words.Paragraph paragraph in doc.GetChildNodes(NodeType.Paragraph, true))
  149. {
  150. var text = paragraph.GetText().Trim();
  151. if (!string.IsNullOrEmpty(text) && text.Length < 100) // 假设标题不会太长
  152. return text;
  153. }
  154. return "未命名文档";
  155. }
  156. catch (Exception ex)
  157. {
  158. _logger.LogWarning(ex, "提取文档标题失败");
  159. return "未命名文档";
  160. }
  161. }
  162. /// <summary>
  163. /// 提取文本内容
  164. /// </summary>
  165. private string ExtractTextContent(Document doc)
  166. {
  167. try
  168. {
  169. // 使用Aspose.Words的GetText方法提取纯文本
  170. return doc.GetText();
  171. }
  172. catch (Exception ex)
  173. {
  174. _logger.LogWarning(ex, "提取文本内容失败");
  175. return string.Empty;
  176. }
  177. }
  178. /// <summary>
  179. /// 提取文档元数据
  180. /// </summary>
  181. private DocumentMetadata ExtractMetadata(Document doc)
  182. {
  183. try
  184. {
  185. var props = doc.BuiltInDocumentProperties;
  186. return new DocumentMetadata
  187. {
  188. Author = props.Author ?? string.Empty,
  189. Company = props.Company ?? string.Empty,
  190. CreatedTime = props.CreatedTime,
  191. LastSavedTime = props.LastSavedTime,
  192. PageCount = doc.PageCount,
  193. WordCount = props.Words,
  194. CharacterCount = props.Characters,
  195. Subject = props.Subject ?? string.Empty,
  196. Keywords = props.Keywords ?? string.Empty
  197. };
  198. }
  199. catch (Exception ex)
  200. {
  201. _logger.LogWarning(ex, "提取元数据失败");
  202. return new DocumentMetadata();
  203. }
  204. }
  205. /// <summary>
  206. /// 提取表格数据
  207. /// </summary>
  208. private List<DocumentTable> ExtractTables(Document doc)
  209. {
  210. var tables = new List<DocumentTable>();
  211. try
  212. {
  213. int tableIndex = 1;
  214. foreach (Aspose.Words.Tables.Table table in doc.GetChildNodes(NodeType.Table, true))
  215. {
  216. var docTable = new DocumentTable
  217. {
  218. TableName = $"表格_{tableIndex}"
  219. };
  220. // 提取表头(假设第一行是表头)
  221. if (table.Rows.Count > 0)
  222. {
  223. var firstRow = table.FirstRow;
  224. foreach (Cell cell in firstRow.Cells)
  225. {
  226. docTable.Headers.Add(cell.GetText().Trim());
  227. }
  228. }
  229. // 提取所有行数据
  230. foreach (Row row in table.Rows)
  231. {
  232. var rowData = new List<string>();
  233. foreach (Cell cell in row.Cells)
  234. {
  235. rowData.Add(cell.GetText().Trim());
  236. }
  237. docTable.Rows.Add(rowData);
  238. }
  239. tables.Add(docTable);
  240. tableIndex++;
  241. }
  242. }
  243. catch (Exception ex)
  244. {
  245. _logger.LogWarning(ex, "提取表格数据失败");
  246. }
  247. return tables;
  248. }
  249. /// <summary>
  250. /// 提取表单字段
  251. /// </summary>
  252. private List<FormField> ExtractFormFields(Document doc)
  253. {
  254. var formFields = new List<FormField>();
  255. try
  256. {
  257. foreach (FormField formField in doc.Range.FormFields)
  258. {
  259. formFields.Add(new FormField
  260. {
  261. Name = formField.Name,
  262. Type = formField.Type.ToString()
  263. //,
  264. //Value = formField.Result ?? string.Empty,
  265. //IsChecked = formField.Checked
  266. });
  267. }
  268. }
  269. catch (Exception ex)
  270. {
  271. _logger.LogWarning(ex, "提取表单字段失败");
  272. }
  273. return formFields;
  274. }
  275. /// <summary>
  276. /// 提取章节信息
  277. /// </summary>
  278. private List<DocumentSection> ExtractSections(Document doc)
  279. {
  280. var sections = new List<DocumentSection>();
  281. try
  282. {
  283. int sectionIndex = 1;
  284. foreach (Section section in doc.Sections)
  285. {
  286. var docSection = new DocumentSection
  287. {
  288. SectionNumber = sectionIndex,
  289. Content = section.GetText(),
  290. ParagraphsCount = section.Body.Paragraphs.Count,
  291. TablesCount = section.Body.Tables.Count
  292. };
  293. sections.Add(docSection);
  294. sectionIndex++;
  295. }
  296. }
  297. catch (Exception ex)
  298. {
  299. _logger.LogWarning(ex, "提取章节信息失败");
  300. }
  301. return sections;
  302. }
  303. /// <summary>
  304. /// 统计图片数量
  305. /// </summary>
  306. private int CountImages(Document doc)
  307. {
  308. try
  309. {
  310. int imageCount = 0;
  311. foreach (Aspose.Words.Drawing.Shape shape in doc.GetChildNodes(NodeType.Shape, true))
  312. {
  313. if (shape.HasImage)
  314. {
  315. imageCount++;
  316. }
  317. }
  318. return imageCount;
  319. }
  320. catch (Exception ex)
  321. {
  322. _logger.LogWarning(ex, "统计图片数量失败");
  323. return 0;
  324. }
  325. }
  326. /// <summary>
  327. /// 检查文件是否为支持的Word格式
  328. /// </summary>
  329. public bool IsSupportedWordFormat(string fileName)
  330. {
  331. var extension = Path.GetExtension(fileName).ToLower();
  332. return extension switch
  333. {
  334. ".doc" => true,
  335. ".docx" => true,
  336. ".dot" => true,
  337. ".dotx" => true,
  338. ".docm" => true,
  339. ".dotm" => true,
  340. _ => false
  341. };
  342. }
  343. }
  344. }