using Aspose.Words;
using Aspose.Words.Tables;
using Aspose.Words.Fields;
using System.Text;

namespace OASystem.API.OAMethodLib.FileProcessing
{
    /// <summary>
    /// File processing service that reads Word documents with Aspose.Words and
    /// converts them into structured data (title, text, metadata, tables,
    /// form fields, sections and an image count).
    /// </summary>
    public class FileProcessingService : IFileProcessingService
    {
        /// <summary>
        /// Upper bound (exclusive) on the length of a paragraph that may be used
        /// as a fallback document title.
        /// </summary>
        private const int MaxTitleLength = 100;

        /// <summary>
        /// File extensions accepted by <see cref="IsSupportedWordFormat"/>.
        /// Compared ordinally and case-insensitively.
        /// </summary>
        private static readonly HashSet<string> SupportedExtensions =
            new HashSet<string>(StringComparer.OrdinalIgnoreCase)
            {
                ".doc", ".docx", ".dot", ".dotx", ".docm", ".dotm"
            };

        private readonly ILogger<FileProcessingService> _logger;
        private readonly IConfiguration _configuration;

        /// <summary>
        /// Creates the service and attempts to load the Aspose.Words license.
        /// </summary>
        /// <param name="logger">Logger used for progress and failure messages.</param>
        /// <param name="configuration">Configuration that supplies <c>Aspose:LicensePath</c>.</param>
        public FileProcessingService(
            ILogger<FileProcessingService> logger,
            IConfiguration configuration)
        {
            _logger = logger;
            _configuration = configuration;
            InitializeAsposeLicense();
        }

        /// <summary>
        /// Loads the Aspose.Words license from the path configured under
        /// <c>Aspose:LicensePath</c>. A missing path or file only logs a warning,
        /// and any exception is logged rather than propagated, so construction
        /// of the service never fails because of licensing.
        /// </summary>
        private void InitializeAsposeLicense()
        {
            try
            {
                var licensePath = _configuration["Aspose:LicensePath"];
                if (!string.IsNullOrEmpty(licensePath) && System.IO.File.Exists(licensePath))
                {
                    var license = new License();
                    license.SetLicense(licensePath);
                    _logger.LogInformation("Aspose.Words 15.12.0 许可证已成功加载");
                }
                else
                {
                    _logger.LogWarning("Aspose.Words许可证文件未找到,将使用评估模式");
                }
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Aspose.Words许可证初始化失败");
            }
        }

        /// <summary>
        /// Processes one uploaded Word document and extracts structured information from it.
        /// </summary>
        /// <param name="file">The uploaded file. Must not be null.</param>
        /// <returns>
        /// A result with <c>Success = true</c> and the extracted data, or
        /// <c>Success = false</c> and an error message when the extension is not
        /// supported or processing throws.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="file"/> is null.</exception>
        public async Task<ProcessingResult> ProcessWordDocumentAsync(IFormFile file)
        {
            // Fail fast: the catch block below reads file.FileName, so a null file
            // would otherwise raise a second NullReferenceException from the handler.
            if (file is null)
            {
                throw new ArgumentNullException(nameof(file));
            }

            var stopwatch = System.Diagnostics.Stopwatch.StartNew();
            try
            {
                _logger.LogInformation("开始处理Word文档: {FileName} ({Size} bytes)", file.FileName, file.Length);

                if (!IsSupportedWordFormat(file.FileName))
                {
                    return CreateFailure(file, $"不支持的文件格式: {Path.GetExtension(file.FileName)}");
                }

                // Buffer the upload in memory and rewind it before handing it to the parser.
                using var stream = new MemoryStream();
                await file.CopyToAsync(stream);
                stream.Position = 0;

                var documentInfo = await ExtractDocumentInfoAsync(stream, file.FileName);

                stopwatch.Stop();
                _logger.LogInformation("Word文档处理完成: {FileName}, 耗时: {ElapsedMs}ms", file.FileName, stopwatch.ElapsedMilliseconds);

                return new ProcessingResult
                {
                    Success = true,
                    Data = documentInfo,
                    FileSize = file.Length,
                    FileType = file.ContentType
                };
            }
            catch (Exception ex)
            {
                stopwatch.Stop();
                _logger.LogError(ex, "Word文档处理失败: {FileName}", file.FileName);
                return CreateFailure(file, $"处理失败: {ex.Message}");
            }
        }

        /// <summary>
        /// Processes several Word documents and returns one result per input file,
        /// in input order.
        /// </summary>
        /// <param name="files">The uploaded files. Must not be null.</param>
        /// <returns>The per-file results.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="files"/> is null.</exception>
        public async Task<List<ProcessingResult>> ProcessWordDocumentsAsync(List<IFormFile> files)
        {
            if (files is null)
            {
                throw new ArgumentNullException(nameof(files));
            }

            // Start every file, then wait for all of them; WhenAll preserves input order.
            var tasks = files.Select(ProcessWordDocumentAsync).ToList();
            var batchResults = await Task.WhenAll(tasks);
            return new List<ProcessingResult>(batchResults);
        }

        /// <summary>
        /// Loads a Word document from <paramref name="stream"/> and extracts its
        /// title, text, metadata, tables, form fields, sections and image count.
        /// </summary>
        /// <param name="stream">Stream containing the document.</param>
        /// <param name="fileName">File name, used only in the failure log message.</param>
        /// <returns>The extracted document information, including elapsed time in milliseconds.</returns>
        /// <remarks>
        /// The body contains no awaits, so the work runs synchronously on the caller.
        /// Failures are logged and rethrown.
        /// </remarks>
        public async Task<WordDocumentInfo> ExtractDocumentInfoAsync(Stream stream, string fileName)
        {
            var stopwatch = System.Diagnostics.Stopwatch.StartNew();
            try
            {
                // Load the Word document.
                var doc = new Document(stream);

                var documentInfo = new WordDocumentInfo
                {
                    Title = ExtractDocumentTitle(doc),
                    Content = ExtractTextContent(doc),
                    Metadata = ExtractMetadata(doc),
                    Tables = ExtractTables(doc),
                    FormFields = ExtractFormFields(doc),
                    Sections = ExtractSections(doc),
                    ImagesCount = CountImages(doc)
                };

                stopwatch.Stop();
                documentInfo.ProcessingTimeMs = stopwatch.ElapsedMilliseconds;
                return documentInfo;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "文档信息提取失败: {FileName}", fileName);
                throw;
            }
        }

        /// <summary>
        /// Builds a failed <see cref="ProcessingResult"/> carrying the file's size and content type.
        /// </summary>
        private static ProcessingResult CreateFailure(IFormFile file, string errorMessage)
        {
            return new ProcessingResult
            {
                Success = false,
                ErrorMessage = errorMessage,
                FileSize = file.Length,
                FileType = file.ContentType
            };
        }

        /// <summary>
        /// Determines the document title: the built-in Title property when set,
        /// otherwise the first non-empty paragraph shorter than
        /// <see cref="MaxTitleLength"/> characters, otherwise a fixed placeholder.
        /// </summary>
        private string ExtractDocumentTitle(Document doc)
        {
            try
            {
                // Prefer the title stored in the document properties.
                var title = doc.BuiltInDocumentProperties.Title;
                if (!string.IsNullOrEmpty(title))
                {
                    return title;
                }

                // Otherwise fall back to the first short, non-empty paragraph
                // (titles are assumed not to be long).
                foreach (Aspose.Words.Paragraph paragraph in doc.GetChildNodes(NodeType.Paragraph, true))
                {
                    var text = paragraph.GetText().Trim();
                    if (!string.IsNullOrEmpty(text) && text.Length < MaxTitleLength)
                    {
                        return text;
                    }
                }

                return "未命名文档";
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex, "提取文档标题失败");
                return "未命名文档";
            }
        }

        /// <summary>
        /// Returns the document text as produced by <c>Document.GetText()</c>,
        /// or an empty string if extraction throws.
        /// </summary>
        private string ExtractTextContent(Document doc)
        {
            try
            {
                return doc.GetText();
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex, "提取文本内容失败");
                return string.Empty;
            }
        }

        /// <summary>
        /// Copies the built-in document properties and page count into a
        /// <see cref="DocumentMetadata"/>. Null string properties become empty
        /// strings; on failure a default instance is returned.
        /// </summary>
        private DocumentMetadata ExtractMetadata(Document doc)
        {
            try
            {
                var props = doc.BuiltInDocumentProperties;
                return new DocumentMetadata
                {
                    Author = props.Author ?? string.Empty,
                    Company = props.Company ?? string.Empty,
                    CreatedTime = props.CreatedTime,
                    LastSavedTime = props.LastSavedTime,
                    PageCount = doc.PageCount,
                    WordCount = props.Words,
                    CharacterCount = props.Characters,
                    Subject = props.Subject ?? string.Empty,
                    Keywords = props.Keywords ?? string.Empty
                };
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex, "提取元数据失败");
                return new DocumentMetadata();
            }
        }

        /// <summary>
        /// Extracts every table in the document, including nested ones. The first
        /// row is copied into <c>Headers</c> and also appears as the first entry
        /// of <c>Rows</c>, because <c>Rows</c> holds all rows.
        /// </summary>
        /// <returns>
        /// The tables extracted so far; a failure part-way through is logged and
        /// the partial list is returned.
        /// </returns>
        private List<DocumentTable> ExtractTables(Document doc)
        {
            var tables = new List<DocumentTable>();
            try
            {
                int tableIndex = 1;
                foreach (Aspose.Words.Tables.Table table in doc.GetChildNodes(NodeType.Table, true))
                {
                    var docTable = new DocumentTable
                    {
                        TableName = $"表格_{tableIndex}"
                    };

                    // Headers: the first row is assumed to be the header row.
                    if (table.Rows.Count > 0)
                    {
                        foreach (Cell cell in table.FirstRow.Cells)
                        {
                            docTable.Headers.Add(GetCellText(cell));
                        }
                    }

                    // Data: every row, header row included.
                    foreach (Row row in table.Rows)
                    {
                        var rowData = new List<string>();
                        foreach (Cell cell in row.Cells)
                        {
                            rowData.Add(GetCellText(cell));
                        }

                        docTable.Rows.Add(rowData);
                    }

                    tables.Add(docTable);
                    tableIndex++;
                }
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex, "提取表格数据失败");
            }

            return tables;
        }

        /// <summary>
        /// Returns the visible text of a table cell.
        /// </summary>
        /// <remarks>
        /// <c>GetText()</c> includes Aspose's end-of-cell control character, which
        /// is not whitespace and therefore survives <c>Trim()</c>; it is removed
        /// explicitly before trimming.
        /// </remarks>
        private static string GetCellText(Cell cell)
        {
            return cell.GetText().Replace(ControlChar.Cell, string.Empty).Trim();
        }

        /// <summary>
        /// Extracts the name and type of every form field in the document.
        /// </summary>
        /// <returns>
        /// The fields extracted so far; a failure is logged and the partial list
        /// is returned.
        /// </returns>
        private List<FormField> ExtractFormFields(Document doc)
        {
            var formFields = new List<FormField>();
            try
            {
                // The loop variable is fully qualified because the unqualified name
                // FormField is also used below for the output DTO.
                // NOTE(review): presumably a project type named FormField is declared
                // in this namespace - confirm.
                foreach (Aspose.Words.Fields.FormField formField in doc.Range.FormFields)
                {
                    formFields.Add(new FormField
                    {
                        Name = formField.Name,
                        Type = formField.Type.ToString()
                        // The field's Result / Checked values are intentionally not copied;
                        // that mapping was disabled in the original code.
                    });
                }
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex, "提取表单字段失败");
            }

            return formFields;
        }

        /// <summary>
        /// Extracts one entry per document section: a 1-based section number, the
        /// section text, and the paragraph and table counts of its body.
        /// </summary>
        /// <returns>
        /// The sections extracted so far; a failure is logged and the partial list
        /// is returned.
        /// </returns>
        private List<DocumentSection> ExtractSections(Document doc)
        {
            var sections = new List<DocumentSection>();
            try
            {
                int sectionIndex = 1;
                foreach (Section section in doc.Sections)
                {
                    sections.Add(new DocumentSection
                    {
                        SectionNumber = sectionIndex,
                        Content = section.GetText(),
                        ParagraphsCount = section.Body.Paragraphs.Count,
                        TablesCount = section.Body.Tables.Count
                    });
                    sectionIndex++;
                }
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex, "提取章节信息失败");
            }

            return sections;
        }

        /// <summary>
        /// Counts the shapes in the document that contain an image.
        /// Returns 0 if counting throws.
        /// </summary>
        private int CountImages(Document doc)
        {
            try
            {
                int imageCount = 0;
                foreach (Aspose.Words.Drawing.Shape shape in doc.GetChildNodes(NodeType.Shape, true))
                {
                    if (shape.HasImage)
                    {
                        imageCount++;
                    }
                }

                return imageCount;
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex, "统计图片数量失败");
                return 0;
            }
        }

        /// <summary>
        /// Checks whether a file name has one of the supported Word extensions
        /// (.doc, .docx, .dot, .dotx, .docm, .dotm), ignoring case.
        /// </summary>
        /// <param name="fileName">File name or path to check.</param>
        /// <returns>
        /// True when the extension is supported; false otherwise, including when
        /// <paramref name="fileName"/> is null, empty or whitespace.
        /// </returns>
        public bool IsSupportedWordFormat(string fileName)
        {
            if (string.IsNullOrWhiteSpace(fileName))
            {
                return false;
            }

            var extension = Path.GetExtension(fileName);
            return !string.IsNullOrEmpty(extension) && SupportedExtensions.Contains(extension);
        }
    }
}