利用C#从pdf文档中批量提取图片和文本

2025-05-03 17:18:13

1、创建VS项目

2、编写提取图片的方法,代码如下:

/// <summary>
/// Visits every page of the PDF and passes each indirect image XObject found
/// in the page resources to <c>RenderImage</c>.
/// </summary>
/// <param name="pdfFile">Path of the PDF file to read.</param>
private void ExtractImage(string pdfFile)
{
    // A single reader serves all pages; it is closed in the finally block
    // so the file handle is released even when a page fails.
    PdfReader pdfReader = new PdfReader(pdfFile);
    try
    {
        for (int pageNumber = 1; pageNumber <= pdfReader.NumberOfPages; pageNumber++)
        {
            PdfDictionary pg = pdfReader.GetPageN(pageNumber);

            // Pages without a resource dictionary or without XObjects have no images.
            PdfDictionary res = PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES)) as PdfDictionary;
            if (res == null)
            {
                continue;
            }

            PdfDictionary xobj = PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT)) as PdfDictionary;
            if (xobj == null)
            {
                continue;
            }

            foreach (PdfName name in xobj.Keys)
            {
                PdfObject obj = xobj.Get(name);
                if (obj == null || !obj.IsIndirect())
                {
                    continue;
                }

                // XObjects may also be forms; only image XObjects are rendered.
                PdfDictionary tg = PdfReader.GetPdfObject(obj) as PdfDictionary;
                if (tg == null || !PdfName.IMAGE.Equals(tg.Get(PdfName.SUBTYPE)))
                {
                    continue;
                }

                try
                {
                    ImageRenderInfo imgRI = ImageRenderInfo.CreateForXObject(new GraphicsState(), (PRIndirectReference)obj, tg);
                    RenderImage(imgRI);
                }
                catch (System.Exception ex)
                {
                    // Best effort: one undecodable image must not stop the
                    // remaining images on this page from being extracted.
                    System.Diagnostics.Trace.TraceWarning(
                        "Skipping image {0} on page {1}: {2}", name, pageNumber, ex.Message);
                }
            }
        }
    }
    finally
    {
        pdfReader.Close();
    }
}

3、将图片保存到文件

/// <summary>
/// Decodes the image described by <paramref name="renderInfo"/> and writes it
/// as a PNG file named <c>image_&lt;count&gt;.png</c> in the current working directory.
/// </summary>
/// <param name="renderInfo">Render information of the image XObject to save.</param>
private void RenderImage(ImageRenderInfo renderInfo)
{
    // NOTE(review): count is declared outside this snippet; presumably a numeric
    // field counting the images seen so far — confirm.
    count++;

    PdfImageObject image = renderInfo.GetImage();
    if (image == null)
    {
        return;
    }

    // Fully qualified so the name cannot clash with iTextSharp.text.Image.
    using (System.Drawing.Image dotnetImg = image.GetDrawingImage())
    {
        if (dotnetImg == null)
        {
            return;
        }

        // The original saved to an empty path, which always throws; build a real,
        // unique file name from the running counter instead. Change the directory
        // here if the images should go somewhere else.
        string fileName = string.Format(
            System.Globalization.CultureInfo.InvariantCulture, "image_{0}.png", count);
        string outputPath = Path.Combine(Directory.GetCurrentDirectory(), fileName);

        // Save from a Bitmap copy, as the original did, and dispose the copy afterwards.
        using (Bitmap copy = new Bitmap(dotnetImg))
        {
            copy.Save(outputPath, ImageFormat.Png);
        }
    }
}

4、从PDF提取文本

/// <summary>
/// Extracts the text of every page of the PDF and returns it as one string,
/// with the text of each page followed by a line break.
/// </summary>
/// <param name="pdfFile">Path of the PDF file to read.</param>
/// <returns>The concatenated text of all pages, in page order.</returns>
public string ExtractTextFromPDFPage(string pdfFile)
{
    PdfReader reader = new PdfReader(pdfFile);
    try
    {
        System.Text.StringBuilder allText = new System.Text.StringBuilder();
        int n = reader.NumberOfPages;
        for (int i = 1; i <= n; i++)
        {
            // The original assigned each page's text to a local and dropped it;
            // collect it so the caller actually receives the result.
            allText.AppendLine(PdfTextExtractor.GetTextFromPage(reader, i));
        }

        return allText.ToString();
    }
    finally
    {
        // Close in finally so the file is released even when extraction throws.
        reader.Close();
    }
}

声明:本网站引用、摘录或转载内容仅供网站访问者交流或参考,不代表本站立场,如存在版权或非法内容,请联系站长删除,联系邮箱:site.kefu@qq.com。
猜你喜欢