123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386 |
-
- using Aspose.Words;
- using Aspose.Words.Tables;
- using Aspose.Words.Fields;
- using System.Text;
- namespace OASystem.API.OAMethodLib.FileProcessing
- {
/// <summary>
/// File-processing service that loads Word documents with Aspose.Words and extracts
/// their title, text, metadata, tables, form fields, sections and image count.
/// </summary>
public class FileProcessingService : IFileProcessingService
{
    /// <summary>
    /// End-of-cell/end-of-row marker (U+0007) that Aspose.Words appends to cell text.
    /// It is a control character, not whitespace, so <c>Trim()</c> alone leaves it in place.
    /// </summary>
    private const string CellEndMarker = "\a";

    /// <summary>Upper bound (exclusive) on the length of a paragraph accepted as a fallback title.</summary>
    private const int MaxTitleLength = 100;

    /// <summary>File extensions accepted by <see cref="IsSupportedWordFormat"/>, compared case-insensitively.</summary>
    private static readonly HashSet<string> s_supportedExtensions =
        new HashSet<string>(StringComparer.OrdinalIgnoreCase)
        {
            ".doc", ".docx", ".dot", ".dotx", ".docm", ".dotm"
        };

    /// <summary>Guards the one-time license setup across concurrently constructed instances.</summary>
    private static readonly object s_licenseGate = new object();

    /// <summary>True once a license has been applied successfully in this process.</summary>
    private static bool s_licenseApplied;

    private readonly ILogger<FileProcessingService> _logger;
    private readonly IConfiguration _configuration;

    /// <summary>
    /// Creates the service and makes sure the Aspose.Words license has been applied.
    /// </summary>
    /// <param name="logger">Logger used for progress and failure messages.</param>
    /// <param name="configuration">Configuration that supplies the <c>Aspose:LicensePath</c> setting.</param>
    public FileProcessingService(
        ILogger<FileProcessingService> logger,
        IConfiguration configuration)
    {
        _logger = logger;
        _configuration = configuration;
        InitializeAsposeLicense();
    }

    /// <summary>
    /// Applies the Aspose.Words license from the path configured under <c>Aspose:LicensePath</c>.
    /// After the first successful load, later calls return immediately; a missing file or a
    /// failure is only logged, so construction of the service never fails because of licensing.
    /// </summary>
    private void InitializeAsposeLicense()
    {
        try
        {
            lock (s_licenseGate)
            {
                if (s_licenseApplied)
                {
                    return;
                }

                var licensePath = _configuration["Aspose:LicensePath"];
                if (string.IsNullOrEmpty(licensePath) || !System.IO.File.Exists(licensePath))
                {
                    _logger.LogWarning("Aspose.Words许可证文件未找到,将使用评估模式");
                    return;
                }

                var license = new License();
                license.SetLicense(licensePath);
                s_licenseApplied = true;
                _logger.LogInformation("Aspose.Words 15.12.0 许可证已成功加载");
            }
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Aspose.Words许可证初始化失败");
        }
    }

    /// <summary>
    /// Processes one uploaded Word document and extracts its structured information.
    /// </summary>
    /// <param name="file">The uploaded file.</param>
    /// <returns>
    /// A result with <c>Success = true</c> and the extracted data, or <c>Success = false</c> and an
    /// error message when the file is missing, has an unsupported extension, or processing fails.
    /// </returns>
    public async Task<ProcessingResult> ProcessWordDocumentAsync(IFormFile file)
    {
        if (file is null)
        {
            // Previously a null file caused a NullReferenceException inside the catch handler.
            _logger.LogWarning("Word文档处理失败: 文件为空");
            return new ProcessingResult
            {
                Success = false,
                ErrorMessage = "处理失败: 文件不能为空"
            };
        }

        var stopwatch = System.Diagnostics.Stopwatch.StartNew();
        try
        {
            _logger.LogInformation("开始处理Word文档: {FileName} ({Size} bytes)",
                file.FileName, file.Length);

            if (!IsSupportedWordFormat(file.FileName))
            {
                return CreateFailure(file, $"不支持的文件格式: {Path.GetExtension(file.FileName)}");
            }

            // Buffer the upload so the document loader gets a seekable stream.
            using var stream = new MemoryStream();
            await file.CopyToAsync(stream);
            stream.Position = 0;

            var documentInfo = await ExtractDocumentInfoAsync(stream, file.FileName);

            stopwatch.Stop();
            _logger.LogInformation("Word文档处理完成: {FileName}, 耗时: {ElapsedMs}ms",
                file.FileName, stopwatch.ElapsedMilliseconds);

            return new ProcessingResult
            {
                Success = true,
                Data = documentInfo,
                FileSize = file.Length,
                FileType = file.ContentType
            };
        }
        catch (Exception ex)
        {
            stopwatch.Stop();
            _logger.LogError(ex, "Word文档处理失败: {FileName}", file.FileName);
            return CreateFailure(file, $"处理失败: {ex.Message}");
        }
    }

    /// <summary>
    /// Processes several Word documents and returns one result per input file, in input order.
    /// </summary>
    /// <param name="files">The uploaded files; <c>null</c> or an empty list yields an empty result list.</param>
    /// <returns>The per-file processing results.</returns>
    public async Task<List<ProcessingResult>> ProcessWordDocumentsAsync(List<IFormFile> files)
    {
        if (files is null || files.Count == 0)
        {
            return new List<ProcessingResult>();
        }

        var batchResults = await Task.WhenAll(files.Select(ProcessWordDocumentAsync));
        return batchResults.ToList();
    }

    /// <summary>
    /// Loads a Word document from <paramref name="stream"/> and extracts its information.
    /// </summary>
    /// <param name="stream">Stream positioned at the start of the document.</param>
    /// <param name="fileName">File name, used only for logging.</param>
    /// <returns>
    /// A task holding the extracted information. If loading or extraction throws, the failure is
    /// logged and the returned task is faulted with that exception, so it surfaces when awaited.
    /// </returns>
    public Task<WordDocumentInfo> ExtractDocumentInfoAsync(Stream stream, string fileName)
    {
        // The work is synchronous, so the method builds the task directly instead of being
        // declared async without any await.
        var stopwatch = System.Diagnostics.Stopwatch.StartNew();
        try
        {
            var doc = new Document(stream);
            var documentInfo = new WordDocumentInfo
            {
                Title = ExtractDocumentTitle(doc),
                Content = ExtractTextContent(doc),
                Metadata = ExtractMetadata(doc),
                Tables = ExtractTables(doc),
                FormFields = ExtractFormFields(doc),
                Sections = ExtractSections(doc),
                ImagesCount = CountImages(doc)
            };

            stopwatch.Stop();
            documentInfo.ProcessingTimeMs = stopwatch.ElapsedMilliseconds;
            return Task.FromResult(documentInfo);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "文档信息提取失败: {FileName}", fileName);
            return Task.FromException<WordDocumentInfo>(ex);
        }
    }

    /// <summary>
    /// Builds a failed <see cref="ProcessingResult"/> carrying the size and content type of the file.
    /// </summary>
    private static ProcessingResult CreateFailure(IFormFile file, string errorMessage)
    {
        return new ProcessingResult
        {
            Success = false,
            ErrorMessage = errorMessage,
            FileSize = file.Length,
            FileType = file.ContentType
        };
    }

    /// <summary>
    /// Removes end-of-cell markers and surrounding whitespace from text returned by <c>GetText()</c>.
    /// </summary>
    private static string CleanNodeText(string rawText)
    {
        return string.IsNullOrEmpty(rawText)
            ? string.Empty
            : rawText.Replace(CellEndMarker, string.Empty).Trim();
    }

    /// <summary>
    /// Returns the document title: the built-in Title property when set, otherwise the first
    /// non-empty paragraph shorter than <see cref="MaxTitleLength"/> characters, otherwise a
    /// fixed placeholder. Failures are logged and also yield the placeholder.
    /// </summary>
    private string ExtractDocumentTitle(Document doc)
    {
        try
        {
            var title = doc.BuiltInDocumentProperties.Title;
            if (!string.IsNullOrEmpty(title))
            {
                return title;
            }

            foreach (Aspose.Words.Paragraph paragraph in doc.GetChildNodes(NodeType.Paragraph, true))
            {
                // A paragraph that ends a table cell carries the cell marker; strip it as well.
                var text = CleanNodeText(paragraph.GetText());
                if (text.Length > 0 && text.Length < MaxTitleLength)
                {
                    return text;
                }
            }

            return "未命名文档";
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "提取文档标题失败");
            return "未命名文档";
        }
    }

    /// <summary>
    /// Returns the text of the whole document as produced by <c>Document.GetText()</c>,
    /// or an empty string when extraction fails.
    /// </summary>
    private string ExtractTextContent(Document doc)
    {
        try
        {
            return doc.GetText();
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "提取文本内容失败");
            return string.Empty;
        }
    }

    /// <summary>
    /// Reads author, company, timestamps, page/word/character counts, subject and keywords
    /// from the built-in document properties. Returns an empty metadata object on failure.
    /// </summary>
    private DocumentMetadata ExtractMetadata(Document doc)
    {
        try
        {
            var props = doc.BuiltInDocumentProperties;
            return new DocumentMetadata
            {
                Author = props.Author ?? string.Empty,
                Company = props.Company ?? string.Empty,
                CreatedTime = props.CreatedTime,
                LastSavedTime = props.LastSavedTime,
                PageCount = doc.PageCount,
                WordCount = props.Words,
                CharacterCount = props.Characters,
                Subject = props.Subject ?? string.Empty,
                Keywords = props.Keywords ?? string.Empty
            };
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "提取元数据失败");
            return new DocumentMetadata();
        }
    }

    /// <summary>
    /// Extracts every table in the document (nested tables included). The first row is treated
    /// as the header row and is also kept as the first entry of the row data. Tables read
    /// before a failure are still returned.
    /// </summary>
    private List<DocumentTable> ExtractTables(Document doc)
    {
        var tables = new List<DocumentTable>();
        try
        {
            int tableIndex = 1;
            foreach (Aspose.Words.Tables.Table table in doc.GetChildNodes(NodeType.Table, true))
            {
                var docTable = new DocumentTable
                {
                    TableName = $"表格_{tableIndex}"
                };

                if (table.Rows.Count > 0)
                {
                    foreach (Cell headerCell in table.FirstRow.Cells)
                    {
                        docTable.Headers.Add(CleanNodeText(headerCell.GetText()));
                    }
                }

                foreach (Row row in table.Rows)
                {
                    var rowData = new List<string>();
                    foreach (Cell cell in row.Cells)
                    {
                        rowData.Add(CleanNodeText(cell.GetText()));
                    }

                    docTable.Rows.Add(rowData);
                }

                tables.Add(docTable);
                tableIndex++;
            }
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "提取表格数据失败");
        }

        return tables;
    }

    /// <summary>
    /// Extracts the name and field type of every form field in the document. Fields read
    /// before a failure are still returned.
    /// </summary>
    private List<FormField> ExtractFormFields(Document doc)
    {
        var formFields = new List<FormField>();
        try
        {
            // The loop variable must be the Aspose type. An unqualified "FormField" binds to the
            // result model used in this method's return type, so each element would be cast to
            // the wrong type and the whole extraction would be lost to the catch below.
            foreach (Aspose.Words.Fields.FormField sourceField in doc.Range.FormFields)
            {
                // NOTE(review): the field's Result and Checked values are not copied; confirm
                // whether the result model has matching properties before adding them.
                formFields.Add(new FormField
                {
                    Name = sourceField.Name,
                    Type = sourceField.Type.ToString()
                });
            }
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "提取表单字段失败");
        }

        return formFields;
    }

    /// <summary>
    /// Extracts, for each section, its 1-based number, text, and the paragraph and table counts
    /// of its body. Sections read before a failure are still returned.
    /// </summary>
    private List<DocumentSection> ExtractSections(Document doc)
    {
        var sections = new List<DocumentSection>();
        try
        {
            int sectionIndex = 1;
            foreach (Section section in doc.Sections)
            {
                // A section without a body reports zero counts instead of aborting the loop.
                var body = section.Body;
                sections.Add(new DocumentSection
                {
                    SectionNumber = sectionIndex,
                    Content = section.GetText(),
                    ParagraphsCount = body?.Paragraphs.Count ?? 0,
                    TablesCount = body?.Tables.Count ?? 0
                });
                sectionIndex++;
            }
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "提取章节信息失败");
        }

        return sections;
    }

    /// <summary>
    /// Counts the shapes in the document that contain an image. Returns 0 on failure.
    /// </summary>
    private int CountImages(Document doc)
    {
        try
        {
            return doc.GetChildNodes(NodeType.Shape, true)
                .OfType<Aspose.Words.Drawing.Shape>()
                .Count(shape => shape.HasImage);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "统计图片数量失败");
            return 0;
        }
    }

    /// <summary>
    /// Checks whether the file name has a supported Word extension
    /// (.doc, .docx, .dot, .dotx, .docm, .dotm), ignoring case.
    /// </summary>
    /// <param name="fileName">File name or path to inspect.</param>
    /// <returns><c>true</c> for a supported extension; <c>false</c> otherwise, including null or blank input.</returns>
    public bool IsSupportedWordFormat(string fileName)
    {
        if (string.IsNullOrWhiteSpace(fileName))
        {
            return false;
        }

        return s_supportedExtensions.Contains(Path.GetExtension(fileName));
    }
}
- }
|