using Aspose.Words;
using Aspose.Words.Tables;
using Aspose.Words.Fields;
using System.Text;
namespace OASystem.API.OAMethodLib.FileProcessing
{
/// <summary>
/// File processing service that uses Aspose.Words to process Word documents.
/// </summary>
public class FileProcessingService : IFileProcessingService
{
private readonly ILogger _logger;
private readonly IConfiguration _configuration;
public FileProcessingService(
ILogger logger,
IConfiguration configuration)
{
_logger = logger;
_configuration = configuration;
InitializeAsposeLicense();
}
///
/// 初始化Aspose.Words许可证
///
private void InitializeAsposeLicense()
{
try
{
var licensePath = _configuration["Aspose:LicensePath"];
if (!string.IsNullOrEmpty(licensePath) && System.IO.File.Exists(licensePath))
{
var license = new License();
license.SetLicense(licensePath);
_logger.LogInformation("Aspose.Words 15.12.0 许可证已成功加载");
}
else
{
_logger.LogWarning("Aspose.Words许可证文件未找到,将使用评估模式");
}
}
catch (Exception ex)
{
_logger.LogError(ex, "Aspose.Words许可证初始化失败");
}
}
///
/// 处理Word文档并提取结构化信息
///
public async Task ProcessWordDocumentAsync(IFormFile file)
{
var stopwatch = System.Diagnostics.Stopwatch.StartNew();
try
{
_logger.LogInformation("开始处理Word文档: {FileName} ({Size} bytes)",
file.FileName, file.Length);
if (!IsSupportedWordFormat(file.FileName))
{
return new ProcessingResult
{
Success = false,
ErrorMessage = $"不支持的文件格式: {Path.GetExtension(file.FileName)}",
FileSize = file.Length,
FileType = file.ContentType
};
}
using var stream = new MemoryStream();
await file.CopyToAsync(stream);
stream.Position = 0;
var documentInfo = await ExtractDocumentInfoAsync(stream, file.FileName);
stopwatch.Stop();
_logger.LogInformation("Word文档处理完成: {FileName}, 耗时: {ElapsedMs}ms",
file.FileName, stopwatch.ElapsedMilliseconds);
return new ProcessingResult
{
Success = true,
Data = documentInfo,
FileSize = file.Length,
FileType = file.ContentType
};
}
catch (Exception ex)
{
stopwatch.Stop();
_logger.LogError(ex, "Word文档处理失败: {FileName}", file.FileName);
return new ProcessingResult
{
Success = false,
ErrorMessage = $"处理失败: {ex.Message}",
FileSize = file.Length,
FileType = file.ContentType
};
}
}
///
/// 批量处理Word文档
///
public async Task> ProcessWordDocumentsAsync(List files)
{
var results = new List();
var tasks = files.Select(ProcessWordDocumentAsync).ToList();
var batchResults = await Task.WhenAll(tasks);
results.AddRange(batchResults);
return results;
}
///
/// 从流中提取文档信息
///
public async Task ExtractDocumentInfoAsync(Stream stream, string fileName)
{
var stopwatch = System.Diagnostics.Stopwatch.StartNew();
try
{
// 加载Word文档
var doc = new Document(stream);
var documentInfo = new WordDocumentInfo
{
Title = ExtractDocumentTitle(doc),
Content = ExtractTextContent(doc),
Metadata = ExtractMetadata(doc),
Tables = ExtractTables(doc),
FormFields = ExtractFormFields(doc),
Sections = ExtractSections(doc),
ImagesCount = CountImages(doc)
};
stopwatch.Stop();
documentInfo.ProcessingTimeMs = stopwatch.ElapsedMilliseconds;
return documentInfo;
}
catch (Exception ex)
{
_logger.LogError(ex, "文档信息提取失败: {FileName}", fileName);
throw;
}
}
///
/// 提取文档标题
///
private string ExtractDocumentTitle(Document doc)
{
try
{
// 首先尝试从文档属性获取标题
var title = doc.BuiltInDocumentProperties.Title;
if (!string.IsNullOrEmpty(title))
return title;
// 如果没有标题,尝试从第一个段落提取
foreach (Aspose.Words.Paragraph paragraph in doc.GetChildNodes(NodeType.Paragraph, true))
{
var text = paragraph.GetText().Trim();
if (!string.IsNullOrEmpty(text) && text.Length < 100) // 假设标题不会太长
return text;
}
return "未命名文档";
}
catch (Exception ex)
{
_logger.LogWarning(ex, "提取文档标题失败");
return "未命名文档";
}
}
///
/// 提取文本内容
///
private string ExtractTextContent(Document doc)
{
try
{
// 使用Aspose.Words的GetText方法提取纯文本
return doc.GetText();
}
catch (Exception ex)
{
_logger.LogWarning(ex, "提取文本内容失败");
return string.Empty;
}
}
///
/// 提取文档元数据
///
private DocumentMetadata ExtractMetadata(Document doc)
{
try
{
var props = doc.BuiltInDocumentProperties;
return new DocumentMetadata
{
Author = props.Author ?? string.Empty,
Company = props.Company ?? string.Empty,
CreatedTime = props.CreatedTime,
LastSavedTime = props.LastSavedTime,
PageCount = doc.PageCount,
WordCount = props.Words,
CharacterCount = props.Characters,
Subject = props.Subject ?? string.Empty,
Keywords = props.Keywords ?? string.Empty
};
}
catch (Exception ex)
{
_logger.LogWarning(ex, "提取元数据失败");
return new DocumentMetadata();
}
}
///
/// 提取表格数据
///
private List ExtractTables(Document doc)
{
var tables = new List();
try
{
int tableIndex = 1;
foreach (Aspose.Words.Tables.Table table in doc.GetChildNodes(NodeType.Table, true))
{
var docTable = new DocumentTable
{
TableName = $"表格_{tableIndex}"
};
// 提取表头(假设第一行是表头)
if (table.Rows.Count > 0)
{
var firstRow = table.FirstRow;
foreach (Cell cell in firstRow.Cells)
{
docTable.Headers.Add(cell.GetText().Trim());
}
}
// 提取所有行数据
foreach (Row row in table.Rows)
{
var rowData = new List();
foreach (Cell cell in row.Cells)
{
rowData.Add(cell.GetText().Trim());
}
docTable.Rows.Add(rowData);
}
tables.Add(docTable);
tableIndex++;
}
}
catch (Exception ex)
{
_logger.LogWarning(ex, "提取表格数据失败");
}
return tables;
}
///
/// 提取表单字段
///
private List ExtractFormFields(Document doc)
{
var formFields = new List();
try
{
foreach (FormField formField in doc.Range.FormFields)
{
formFields.Add(new FormField
{
Name = formField.Name,
Type = formField.Type.ToString()
//,
//Value = formField.Result ?? string.Empty,
//IsChecked = formField.Checked
});
}
}
catch (Exception ex)
{
_logger.LogWarning(ex, "提取表单字段失败");
}
return formFields;
}
///
/// 提取章节信息
///
private List ExtractSections(Document doc)
{
var sections = new List();
try
{
int sectionIndex = 1;
foreach (Section section in doc.Sections)
{
var docSection = new DocumentSection
{
SectionNumber = sectionIndex,
Content = section.GetText(),
ParagraphsCount = section.Body.Paragraphs.Count,
TablesCount = section.Body.Tables.Count
};
sections.Add(docSection);
sectionIndex++;
}
}
catch (Exception ex)
{
_logger.LogWarning(ex, "提取章节信息失败");
}
return sections;
}
///
/// 统计图片数量
///
private int CountImages(Document doc)
{
try
{
int imageCount = 0;
foreach (Aspose.Words.Drawing.Shape shape in doc.GetChildNodes(NodeType.Shape, true))
{
if (shape.HasImage)
{
imageCount++;
}
}
return imageCount;
}
catch (Exception ex)
{
_logger.LogWarning(ex, "统计图片数量失败");
return 0;
}
}
///
/// 检查文件是否为支持的Word格式
///
public bool IsSupportedWordFormat(string fileName)
{
var extension = Path.GetExtension(fileName).ToLower();
return extension switch
{
".doc" => true,
".docx" => true,
".dot" => true,
".dotx" => true,
".docm" => true,
".dotm" => true,
_ => false
};
}
}
}