C# PDF Text Reader Library
How to read, parse, convert, extract PDF to text files using C#.net in ASP.NET MVC, WinForms, WPF application
Full-featured sample source code for converting PDF to Text (TXT) in Visual C# with .NET XDoc.PDF Converter Library. Free Online Trial Download
- Best Visual C#.NET PDF to text converting SDK for converting PDF to text in Visual Studio .NET project
- Powerful .NET control for batch converting PDF to editable & searchable text formats in C# class
- Free evaluation library for exporting PDF to Text in both C#.NET WinForms application and ASP.NET WebForms
- Support .NET WinForms, ASP.NET MVC in IIS, ASP.NET Ajax, Azure cloud service, DNN (DotNetNuke), SharePoint
- All text content of target PDF document can be copied and pasted to .txt files by keeping original layout
- C#.NET class source code for converting each PDF document page to separate text file
- Text in any font, color and size, as well as highlighted characters, can easily be converted to plain text
- Text can be extracted from scanned PDF image with OCR component
Professional PDF to text converting library from RasterEdge PDF document conversion SDK provides reliable and effective .NET solution for Visual C# developers to
convert PDF document to editable & searchable text file. Different from other C# .NET PDF to text conversion controls, RasterEdge C# PDF to text
converter control toolkit can convert PDF document to text file with good formatting.
Some other PDF tools convert PDF to text with a method that loses the original PDF document layout and joins all the paragraphs together. In comparison,
our C# PDF to text converter SDK distinguishes itself from existing PDF to text conversion software through the good quality of its output text file.
The outputted text file, converted by our C# PDF to text converting library, is separated by page and all the paragraphs are well retained with nice formatting.
In addition, RasterEdge also provides other industry-leading methods to convert target PDF document to other editable file formats using Visual C# code,
such as, PDF to HTML converter assembly, PDF to Word converter assembly and PDF to PNG converter control.
C#.NET DLLs: Use PDF to Text Converter Control in C#.NET
In this part, we will tell C# developers how to use RasterEdge PDF to text converting library in Visual C# .NET class application.
What should be noted here is that our PDF to text converting library is built in Visual Studio 2005 and .NET Framework 2.0.
Thus, please make sure you have installed VS 2005 or above versions and .NET Framework 2.0 or greater.
Now you can convert source PDF document to text file using the C# demo code we have offered below.
#region pdf to text (file to file)
/// <summary>
/// Converts the PDF file at C:\demo.pdf to a plain-text file at C:\output.txt.
/// Pages are processed in order; the text lines extracted from each page are
/// written through <c>writeTextLines</c>.
/// </summary>
internal static void convertPdfToText()
{
    String inputFilePath = @"C:\demo.pdf";
    String outputFilePath = @"C:\output.txt";

    // Open the source document before creating the output file, so that a
    // missing or unreadable PDF does not leave an empty output file behind.
    PDFDocument doc = new PDFDocument(inputFilePath);
    PDFTextMgr textMgr = PDFTextHandler.ExportPDFTextManager(doc);

    // 'using' flushes the writer and releases the file handle even when
    // text extraction throws part-way through the document.
    using (StreamWriter writer = new StreamWriter(outputFilePath))
    {
        int pageCount = doc.GetPageCount();
        for (int i = 0; i < pageCount; i++)
        {
            PDFPage page = (PDFPage)doc.GetPage(i);
            List<PDFTextLine> pageTextLines = textMgr.ExtractTextLine(page);
            writeTextLines(pageTextLines, writer);
        }
    }
}
#endregion
#region pdf to text (stream to stream)
/// <summary>
/// Loads C:\demo.pdf into memory, opens it as a PDF document from a stream and
/// writes the text lines extracted from every page into an in-memory stream.
/// </summary>
/// <remarks>
/// The output stream is closed together with its writer when the method ends,
/// so the extracted text is not available to the caller; read the stream
/// inside the writer's scope if the text is needed.
/// </remarks>
internal static void convertPdfStreamToText()
{
    String inputFilePath = @"C:\demo.pdf";
    byte[] arr = File.ReadAllBytes(inputFilePath);

    // Dispose the input stream deterministically, including on failure.
    using (Stream inputStream = new MemoryStream(arr))
    {
        Stream stream = new MemoryStream();

        // Disposing the writer flushes it and closes the underlying output
        // stream as well, even if extraction throws.
        using (StreamWriter writer = new StreamWriter(stream))
        {
            PDFDocument doc = new PDFDocument(inputStream);
            PDFTextMgr textMgr = PDFTextHandler.ExportPDFTextManager(doc);
            int pageCount = doc.GetPageCount();
            for (int i = 0; i < pageCount; i++)
            {
                PDFPage page = (PDFPage)doc.GetPage(i);
                List<PDFTextLine> pageTextLines = textMgr.ExtractTextLine(page);
                writeTextLines(pageTextLines, writer);
            }
        }
    }
}
#endregion
#region write text to stream
/// <summary>
/// Writes the text lines of one page to <paramref name="writer"/>, followed by
/// two lines that each contain a single space, and then flushes the writer.
/// </summary>
/// <param name="pageTextLines">Text lines of the page; may be null, in which case only the two trailing lines are written.</param>
/// <param name="writer">Destination writer.</param>
private static void writeTextLines(List<PDFTextLine> pageTextLines, StreamWriter writer)
{
    String pending = "";
    float prevY = 0f;
    float prevHeight = 0f;
    float prevX = 0f;
    #region current page do not contain images
    if (pageTextLines != null)
    {
        int total = pageTextLines.Count;
        int index = 0;
        while (index < total)
        {
            PDFTextLine current = pageTextLines[index];
            RectangleF bounds = current.GetBoundary();

            // A fragment whose bottom edge (Y + Height) differs from the previous
            // fragment's starts a new output line: emit what has been collected.
            bool startsNewRow = index != 0 && !isEqual(prevY + prevHeight, bounds.Y + bounds.Height);
            if (startsNewRow)
            {
                writer.WriteLine(pending);
                pending = "";
            }

            String content = getTextLineContent(current);
            if (prevX > bounds.X)
            {
                // Fragment lies to the left of the previous one: prepend it.
                pending = content + " " + pending;
            }
            else
            {
                pending = pending + content + " ";
            }

            prevY = bounds.Y;
            prevHeight = bounds.Height;
            prevX = bounds.X;
            index++;
        }

        // Emit the last collected line, if the page had any fragments.
        if (total > 0)
        {
            writer.WriteLine(pending);
        }
    }
    #endregion
    // Page separator: two lines holding a single space each.
    writer.WriteLine(" ");
    writer.WriteLine(" ");
    writer.Flush();
}
/// <summary>
/// Concatenates the content of every word in <paramref name="pdfTextLine"/>.
/// A single space is inserted before a word when the previous word's
/// X + Width does not match the word's X within the tolerance of <c>isEqual</c>.
/// </summary>
/// <param name="pdfTextLine">Text line whose words are joined.</param>
/// <returns>The joined text; an empty string when the line has no words.</returns>
private static String getTextLineContent(PDFTextLine pdfTextLine)
{
    List<PDFTextWord> wordList = pdfTextLine.GetTextWord();
    String joined = "";
    float previousX = 0;
    float previousWidth = 0;
    bool isFirstWord = true;
    foreach (PDFTextWord word in wordList)
    {
        RectangleF box = word.GetBoundary();

        // The first word never gets a leading space; later words get one
        // only when there is a gap after the previous word.
        if (!isFirstWord && !isEqual(previousX + previousWidth, box.X))
        {
            joined += " ";
        }
        joined += word.GetContent();

        previousX = box.X;
        previousWidth = box.Width;
        isFirstWord = false;
    }
    return joined;
}
/// <summary>
/// Reports whether two coordinates are approximately equal, i.e. whether
/// their difference lies strictly between -2 and 2.
/// </summary>
/// <param name="first">First value.</param>
/// <param name="second">Second value.</param>
/// <returns>True when the values differ by less than 2; otherwise false.</returns>
private static bool isEqual(float first, float second)
{
    float delta = first - second;
    return delta > -2F && delta < 2F;
}
#endregion
C#: convert two or more PDF files to text (batch conversion)
#region pdf to text (batch files)
/// <summary>
/// Converts every "*.pdf" file found in C:\input\ to a text file of the same
/// base name (with a ".txt" extension) in C:\output\.
/// </summary>
internal static void convertPdfFilesToText()
{
    String inputDirectory = @"C:\input\";
    String outputDirectory = @"C:\output\";
    String[] files = Directory.GetFiles(inputDirectory, "*.pdf");
    foreach (String filePath in files)
    {
        // Let the framework derive the base name instead of slicing the path
        // by hand around the last '\' and '.' characters.
        String docName = Path.GetFileNameWithoutExtension(filePath);
        String outputFilePath = Path.Combine(outputDirectory, docName + ".txt");

        // Open the source document before creating the output file, so that
        // an unreadable PDF does not leave an empty text file behind.
        PDFDocument doc = new PDFDocument(filePath);
        PDFTextMgr textMgr = PDFTextHandler.ExportPDFTextManager(doc);

        // 'using' releases the output file handle even if extraction throws;
        // a leaked handle would keep the file locked for the rest of the batch.
        using (StreamWriter writer = new StreamWriter(outputFilePath))
        {
            int pageCount = doc.GetPageCount();
            for (int i = 0; i < pageCount; i++)
            {
                PDFPage page = (PDFPage)doc.GetPage(i);
                List<PDFTextLine> pageTextLines = textMgr.ExtractTextLine(page);
                writeTextLines(pageTextLines, writer);
            }
        }
    }
}
#endregion
C#: combine multiple PDF files and convert the result to text
#region combine pdf files and convert to text
/// <summary>
/// Combines C:\demo1.pdf, C:\demo2.pdf and C:\demo3.pdf into one in-memory PDF
/// document and writes the text lines extracted from every page of the
/// combined document into an in-memory stream.
/// </summary>
/// <remarks>
/// The output stream is closed together with its writer when the method ends,
/// so the extracted text is not available to the caller; read the stream
/// inside the writer's scope if the text is needed.
/// </remarks>
internal static void combineAndConvertToText()
{
    String[] files = new String[] { @"C:\demo1.pdf", @"C:\demo2.pdf", @"C:\demo3.pdf" };

    // Dispose the combined-document stream deterministically, including on failure.
    using (Stream inputStream = new MemoryStream())
    {
        PDFDocument.CombineDocument(files, inputStream);

        // NOTE(review): CombineDocument presumably leaves the stream positioned
        // at its end after writing. Rewinding is harmless for a MemoryStream and
        // ensures the document is read from the first byte - confirm against
        // the library documentation.
        inputStream.Seek(0, SeekOrigin.Begin);

        Stream stream = new MemoryStream();

        // Disposing the writer flushes it and closes the underlying output
        // stream as well, even if extraction throws.
        using (StreamWriter writer = new StreamWriter(stream))
        {
            PDFDocument doc = new PDFDocument(inputStream);
            PDFTextMgr textMgr = PDFTextHandler.ExportPDFTextManager(doc);
            int pageCount = doc.GetPageCount();
            for (int i = 0; i < pageCount; i++)
            {
                PDFPage page = (PDFPage)doc.GetPage(i);
                List<PDFTextLine> pageTextLines = textMgr.ExtractTextLine(page);
                writeTextLines(pageTextLines, writer);
            }
        }
    }
}
#endregion