// XinXiBu / OA2023
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
using Aspose.Words;
using Aspose.Words.Tables;

/// <summary>
/// Extracts applicant fields from visa application forms stored as Word documents.
/// The visa type is chosen from keywords in the file name, and each visa type has its
/// own rules for mapping form labels to extracted field values.
/// </summary>
public class VisaFormExtractor
{
    /// <summary>Matches a date written as "yyyy年M月d日". Built once and reused for every cell.</summary>
    private static readonly Regex ChineseDatePattern = new Regex(@"\d{4}年\d{1,2}月\d{1,2}日");

    /// <summary>
    /// Glyphs that mark an option as ticked when they appear immediately before it.
    /// "R" is the marker the form handling was originally written for (NOTE(review): presumably a
    /// symbol-font check box that surfaces as the letter R in plain text - confirm against the
    /// template); the remaining entries are common check marks. Extend per template.
    /// </summary>
    private static readonly string[] CheckedMarkers = { "R", "☑", "√", "✓" };

    /// <summary>Marital status options currently recognised; add further options as the template requires.</summary>
    private static readonly string[] MaritalStatusOptions = { "已婚", "未婚" };

    /// <summary>Gender options recognised on the form.</summary>
    private static readonly string[] GenderOptions = { "男", "女" };

    /// <summary>
    /// Loads the document at <paramref name="filePath"/>, decides the visa type from the file name
    /// and runs the matching extractor.
    /// </summary>
    /// <param name="filePath">Path of the Word document to read.</param>
    /// <returns>
    /// The file name, the detected visa type (left unset when no keyword matches) and the
    /// extracted fields. Fields that could not be found are absent from the dictionary.
    /// </returns>
    /// <exception cref="ArgumentException"><paramref name="filePath"/> is null, empty or whitespace.</exception>
    public VisaApplicationData ExtractData(string filePath)
    {
        if (string.IsNullOrWhiteSpace(filePath))
        {
            throw new ArgumentException("A document path is required.", nameof(filePath));
        }

        var data = new VisaApplicationData();
        data.FileName = System.IO.Path.GetFileName(filePath);

        // Load the document; loading errors propagate to the caller.
        Document doc = new Document(filePath);

        // Simple dispatch on file-name keywords. Tables are only collected for the
        // extractors that actually work on tables.
        if (data.FileName.Contains("澳新") || data.FileName.Contains("澳大利亚") || data.FileName.Contains("新西兰"))
        {
            data.VisaType = "ANZ";
            ExtractANZData(GetTables(doc), data.ExtractedFields);
        }
        else if (data.FileName.Contains("美国"))
        {
            data.VisaType = "USA";
            ExtractUSAData(GetTables(doc), data.ExtractedFields);
        }
        else if (data.FileName.Contains("申根"))
        {
            data.VisaType = "Schengen";
            ExtractSchengenData(doc, data.ExtractedFields); // Schengen forms may not be purely tabular
        }
        // ... add detection and extraction for further visa types here

        return data;
    }

    /// <summary>
    /// Extracts fields from an Australia / New Zealand form by scanning every table cell for known labels.
    /// </summary>
    /// <param name="tables">All tables of the document.</param>
    /// <param name="fields">Dictionary that receives the extracted values; only non-empty values are written.</param>
    private static void ExtractANZData(Table[] tables, Dictionary<string, string> fields)
    {
        foreach (Table table in tables)
        {
            foreach (Row row in table.Rows)
            {
                for (int cellIndex = 0; cellIndex < row.Cells.Count; cellIndex++)
                {
                    string cellText = CleanText(row.Cells[cellIndex].GetText());

                    // 1. Name: the label must not be the "former name" or "pinyin" variant;
                    //    the value is assumed to sit in the next cell of the same row.
                    if (cellText.Contains("姓名") && !cellText.Contains("曾用名") && !cellText.Contains("拼音"))
                    {
                        string name = GetAdjacentCellValue(row, cellIndex, isNext: true);
                        if (!string.IsNullOrEmpty(name))
                        {
                            fields["姓名"] = name;
                        }
                    }

                    // 2. Date of birth: value assumed to be in the next cell. A "yyyy年M月d日"
                    //    date is isolated when present; otherwise the raw text is kept.
                    if (cellText.Contains("出生日期"))
                    {
                        string birthDate = GetAdjacentCellValue(row, cellIndex, isNext: true);
                        if (!string.IsNullOrEmpty(birthDate))
                        {
                            Match dateMatch = ChineseDatePattern.Match(birthDate);
                            fields["出生日期"] = dateMatch.Success ? dateMatch.Value : birthDate;
                        }
                    }

                    // 3. Gender: the option may be inside the label cell (possibly as a ticked
                    //    choice) or in the next cell. A cell that lists both options without a
                    //    recognisable tick is ambiguous and is not recorded.
                    if (cellText.Contains("性 别") || cellText.Contains("性别"))
                    {
                        string gender = ResolveOption(cellText, GenderOptions);
                        if (gender == null)
                        {
                            gender = ResolveOption(GetAdjacentCellValue(row, cellIndex, isNext: true), GenderOptions);
                        }

                        if (gender != null)
                        {
                            fields["性别"] = gender;
                        }
                    }

                    // 4. Marital status: in the label cell only a ticked option counts (the cell
                    //    normally lists every option); the next cell may hold a ticked or plain value.
                    //    If tick boxes turn out to be symbol-font runs, inspecting run fonts may be needed.
                    if (cellText.Contains("婚姻状况"))
                    {
                        string status = FindCheckedOption(cellText, MaritalStatusOptions);
                        if (status == null)
                        {
                            status = ResolveOption(GetAdjacentCellValue(row, cellIndex, isNext: true), MaritalStatusOptions);
                        }

                        if (status != null)
                        {
                            fields["婚姻状况"] = status;
                        }
                    }

                    // ... add more fields according to the ANZ template, e.g. employer name,
                    // monthly income, family members, education history. Those may need more
                    // involved logic such as reading across rows or tables.
                }
            }
        }
    }

    /// <summary>
    /// Extracts fields from a USA form. Labels carry a trailing full-width colon and the value is
    /// either in the same cell after the label or in the next cell.
    /// </summary>
    /// <param name="tables">All tables of the document.</param>
    /// <param name="fields">Dictionary that receives the extracted values; only non-empty values are written.</param>
    private static void ExtractUSAData(Table[] tables, Dictionary<string, string> fields)
    {
        foreach (Table table in tables)
        {
            foreach (Row row in table.Rows)
            {
                for (int cellIndex = 0; cellIndex < row.Cells.Count; cellIndex++)
                {
                    string cellText = CleanText(row.Cells[cellIndex].GetText());

                    StoreLabeledValue(row, cellIndex, cellText, "姓名：", "姓名", fields);
                    StoreLabeledValue(row, cellIndex, cellText, "出生日期：", "出生日期", fields);
                    // ... add USA-specific fields such as passport number, DS-160 confirmation
                    // number, US contact person, etc.
                }
            }
        }
    }

    /// <summary>
    /// Extracts fields from a Schengen form, which may present fields as plain paragraphs rather than table cells.
    /// </summary>
    /// <param name="doc">The loaded document.</param>
    /// <param name="fields">Dictionary that receives the extracted values; only non-empty values are written.</param>
    private static void ExtractSchengenData(Document doc, Dictionary<string, string> fields)
    {
        const string nameLabel = "1.姓名：";

        foreach (Paragraph para in doc.GetChildNodes(NodeType.Paragraph, true))
        {
            // Paragraphs that end a table cell carry the cell marker, so clean it the same way as cell text.
            string paraText = CleanText(para.GetText());

            if (paraText.StartsWith(nameLabel, StringComparison.Ordinal))
            {
                // Only the text after the label on the same paragraph is read; a value placed
                // on the following line is not handled yet.
                string name = paraText.Substring(nameLabel.Length).Trim();
                if (name.Length > 0)
                {
                    fields["姓名"] = name;
                }
            }
            // ... handle the other Schengen fields
        }

        // TODO: also scan GetTables(doc) with Schengen-specific mapping rules, similar to the ANZ and USA extractors.
    }

    // --- Helpers ---

    /// <summary>Collects every table in the document, including nested ones.</summary>
    private static Table[] GetTables(Document doc)
    {
        return doc.GetChildNodes(NodeType.Table, true).Cast<Table>().ToArray();
    }

    /// <summary>
    /// Removes the "\a" cell marker and then trims surrounding whitespace. The marker must be removed
    /// first: it is not whitespace, so trimming before removal leaves any whitespace in front of it in place.
    /// </summary>
    private static string CleanText(string rawText)
    {
        return rawText.Replace("\a", "").Trim();
    }

    /// <summary>
    /// Gets the cleaned text of the cell next to the given one in the same row.
    /// </summary>
    /// <param name="row">The current row.</param>
    /// <param name="currentCellIndex">Index of the current cell.</param>
    /// <param name="isNext">true = following cell, false = preceding cell.</param>
    /// <returns>The neighbouring cell's text, or null when there is no such cell.</returns>
    private static string GetAdjacentCellValue(Row row, int currentCellIndex, bool isNext = true)
    {
        int targetIndex = isNext ? currentCellIndex + 1 : currentCellIndex - 1;
        if (targetIndex < 0 || targetIndex >= row.Cells.Count)
        {
            return null;
        }

        return CleanText(row.Cells[targetIndex].GetText());
    }

    /// <summary>
    /// When <paramref name="cellText"/> starts with <paramref name="label"/>, stores the value found
    /// after the label (or, if that is empty, in the next cell) under <paramref name="fieldName"/>.
    /// Nothing is stored when no value is found.
    /// </summary>
    private static void StoreLabeledValue(
        Row row,
        int cellIndex,
        string cellText,
        string label,
        string fieldName,
        Dictionary<string, string> fields)
    {
        if (!cellText.StartsWith(label, StringComparison.Ordinal))
        {
            return;
        }

        // Take everything after the label so that values which themselves contain a colon stay intact.
        string value = cellText.Substring(label.Length).Trim();
        if (value.Length == 0)
        {
            value = GetAdjacentCellValue(row, cellIndex, isNext: true);
        }

        if (!string.IsNullOrEmpty(value))
        {
            fields[fieldName] = value;
        }
    }

    /// <summary>
    /// Returns the first option that is immediately preceded by one of the <see cref="CheckedMarkers"/>,
    /// or null when none is (or when <paramref name="text"/> is null or empty).
    /// </summary>
    private static string FindCheckedOption(string text, string[] options)
    {
        if (string.IsNullOrEmpty(text))
        {
            return null;
        }

        foreach (string option in options)
        {
            foreach (string marker in CheckedMarkers)
            {
                if (text.Contains(marker + option))
                {
                    return option;
                }
            }
        }

        return null;
    }

    /// <summary>
    /// Picks the selected option from <paramref name="text"/>: a ticked option wins; otherwise the
    /// option is returned only if it is the sole one mentioned. Returns null when nothing matches
    /// or when several options appear without a recognisable tick.
    /// </summary>
    private static string ResolveOption(string text, string[] options)
    {
        string ticked = FindCheckedOption(text, options);
        if (ticked != null || string.IsNullOrEmpty(text))
        {
            return ticked;
        }

        string soleOption = null;
        foreach (string option in options)
        {
            if (!text.Contains(option))
            {
                continue;
            }

            if (soleOption != null)
            {
                return null; // more than one option present and none ticked: ambiguous
            }

            soleOption = option;
        }

        return soleOption;
    }

    /// <summary>
    /// Generic result model; a more detailed model can be defined per visa type.
    /// </summary>
    public class VisaApplicationData
    {
        /// <summary>File name (without directory) of the processed document.</summary>
        public string FileName { get; set; }

        /// <summary>Visa type derived from the file name ("ANZ", "USA", "Schengen"); unset when not recognised.</summary>
        public string VisaType { get; set; }

        /// <summary>Extracted values keyed by field label.</summary>
        public Dictionary<string, string> ExtractedFields { get; set; } = new Dictionary<string, string>();
    }
}