C# PDF Text Converter Library
How to read, convert PDF to text files (txt) using C#.net in ASP.NET MVC, WinForms, WPF application
Full-featured sample source code for converting PDF to Text (TXT) in Visual C# with .NET XDoc.PDF Converter Library. Free Online Trial Download
In this tutorial, you will learn how to convert PDF document to text file (.txt) using C# PDF to Text Conversion library in .NET Windows, ASP.NET MVC web application.
- PDF to Text file (.txt) conversion
- Output the converted PDF text file in a specified encoding, such as UTF-8, Unicode, ASCII, ISO-8859-1
- PDF to Text multiple files conversion
- Convert PDF text content to lines
How to convert PDF document to Text file using C#
- Best Visual C#.NET PDF to text converting SDK for converting PDF to text in Visual Studio .NET project
- Powerful .NET control for batch converting PDF to editable & searchable text formats in C# class
- Free evaluation library for exporting PDF to Text in both C#.NET WinForms application and ASP.NET WebForms
- Support .NET WinForms, ASP.NET MVC in IIS, ASP.NET Ajax, Azure cloud service, DNN (DotNetNuke), SharePoint
- All text content of target PDF document can be copied and pasted to .txt files by keeping original layout
-
You can view more PDF conversion at:
c# pdf to tiff,
c# convert csv to pdf,
c# pdf converter,
c# convert tiff to pdf,
c# convert png to pdf,
c# code to save word document as pdf,
c# pdf to image.
- C#.NET class source code for converting each PDF document page to separate text file
- Text in any font, color and size, as well as highlighted characters, can easily be converted to plain text
- Text can be extracted from scanned PDF image with OCR component
Professional PDF to text converting library from RasterEdge PDF document conversion SDK provides reliable and effective .NET solution for Visual C# developers to
convert PDF document to editable & searchable text file. Different from other C# .NET PDF to text conversion controls, RasterEdge C# PDF to text
converter control toolkit can convert PDF document to text file with good formatting.
Some other PDF tools convert PDF to text with a method that loses the original PDF document layout and joins all the paragraphs together.
By comparison, our C# PDF to text converter SDK distinguishes itself from those existing PDF to text conversion tools through the quality of its output text file.
The outputted text file, converted by our C# PDF to text converting library, is separated by page and all the paragraphs are well retained with nice formatting.
In addition, RasterEdge also provides other industry-leading methods to convert target PDF document to other editable file formats using Visual C# code,
such as, PDF to HTML converter assembly, PDF to Word converter assembly and PDF to PNG converter control.
How to convert PDF document to text file using C# in ASP.NET, Windows application?
In this part, we will tell C# developers how to use RasterEdge PDF to text converting library in Visual C# .NET class application.
What should be noted here is that our PDF to text converting library is built in Visual Studio 2005 and .NET Framework 2.0.
Thus, please make sure you have installed VS 2005 or above versions and .NET Framework 2.0 or greater.
Now you can convert source PDF document to text file using the C# demo code we have offered below.
#region pdf to text (file to file)
/// <summary>
/// Converts the PDF document at C:\demo.pdf to the text file C:\output.txt,
/// writing the extracted text lines of every page in page order.
/// </summary>
internal static void convertPdfToText()
{
    String inputFilePath = @"C:\demo.pdf";
    String outputFilePath = @"C:\output.txt";

    // The using block closes the output file even when loading the PDF or
    // extracting text throws part-way through.
    using (StreamWriter writer = new StreamWriter(outputFilePath))
    {
        PDFDocument doc = new PDFDocument(inputFilePath);
        PDFTextMgr textMgr = PDFTextHandler.ExportPDFTextManager(doc);

        int pageCount = doc.GetPageCount();
        for (int i = 0; i < pageCount; i++)
        {
            PDFPage page = (PDFPage)doc.GetPage(i);
            List<PDFTextLine> pageTextLines = textMgr.ExtractTextLine(page);
            writeTextLines(pageTextLines, writer);
        }
    }
}
#endregion
#region pdf to text (stream to stream)
/// <summary>
/// Loads C:\demo.pdf into an in-memory stream and writes the extracted text
/// of every page to a second in-memory stream.
/// </summary>
internal static void convertPdfStreamToText()
{
    String inputFilePath = @"C:\demo.pdf";
    byte[] arr = File.ReadAllBytes(inputFilePath);

    // using blocks release the streams and the writer even if conversion throws.
    using (Stream inputStream = new MemoryStream(arr))
    using (Stream stream = new MemoryStream())
    using (StreamWriter writer = new StreamWriter(stream))
    {
        PDFDocument doc = new PDFDocument(inputStream);
        PDFTextMgr textMgr = PDFTextHandler.ExportPDFTextManager(doc);

        int pageCount = doc.GetPageCount();
        for (int i = 0; i < pageCount; i++)
        {
            PDFPage page = (PDFPage)doc.GetPage(i);
            List<PDFTextLine> pageTextLines = textMgr.ExtractTextLine(page);
            writeTextLines(pageTextLines, writer);
        }

        // writeTextLines flushes the writer after each page, so 'stream' holds
        // the converted text at this point. Consume it here: disposing the
        // writer at the end of this block also closes 'stream'.
    }
}
#endregion
#region write text to stream
/// <summary>
/// Writes the text lines of one page to <paramref name="writer"/>.
/// Consecutive entries whose bottom edge (Y + Height) is within the isEqual
/// tolerance of the previous entry are merged into one output line; an entry
/// that starts left of the previous one is prepended instead of appended.
/// Two separator lines are written after the page and the writer is flushed.
/// </summary>
/// <param name="pageTextLines">Text lines of the page; may be null, in which case only the separators are written.</param>
/// <param name="writer">Destination writer.</param>
private static void writeTextLines(List<PDFTextLine> pageTextLines, StreamWriter writer)
{
    String pending = "";
    float lastY = 0f;
    float lastHeight = 0f;
    float lastX = 0f;

    // A null list is treated as a page with nothing to emit.
    int total = (pageTextLines == null) ? 0 : pageTextLines.Count;

    for (int index = 0; index < total; index++)
    {
        PDFTextLine current = pageTextLines[index];
        RectangleF bounds = current.GetBoundary();

        // A different bottom edge than the previous entry starts a new output line.
        if (index > 0 && !isEqual(lastY + lastHeight, bounds.Y + bounds.Height))
        {
            writer.WriteLine(pending);
            pending = "";
        }

        String content = getTextLineContent(current);
        if (lastX > bounds.X)
        {
            // Entry lies to the left of the previous one: put it in front.
            pending = content + " " + pending;
        }
        else
        {
            pending = pending + content + " ";
        }

        lastY = bounds.Y;
        lastHeight = bounds.Height;
        lastX = bounds.X;
    }

    // Emit whatever is still buffered from the last entry.
    if (total > 0)
    {
        writer.WriteLine(pending);
    }

    writer.WriteLine(" ");
    writer.WriteLine(" ");
    writer.Flush();
}
/// <summary>
/// Builds the text of a single PDF text line by concatenating its words.
/// A space is inserted before a word when its X position is not within the
/// isEqual tolerance of the previous word's right edge (X + Width).
/// </summary>
/// <param name="pdfTextLine">The text line whose words are joined.</param>
/// <returns>The concatenated word contents.</returns>
private static String getTextLineContent(PDFTextLine pdfTextLine)
{
    String result = "";
    float previousLeft = 0;
    float previousWidth = 0;
    bool isFirstWord = true;

    foreach (PDFTextWord word in pdfTextLine.GetTextWord())
    {
        RectangleF box = word.GetBoundary();

        // A gap between the previous word's right edge and this word means a space.
        if (!isFirstWord && !isEqual(previousLeft + previousWidth, box.X))
        {
            result += " ";
        }

        result += word.GetContent();
        previousLeft = box.X;
        previousWidth = box.Width;
        isFirstWord = false;
    }

    return result;
}
/// <summary>
/// Reports whether two coordinates differ by strictly less than 2 units.
/// </summary>
/// <param name="first">First value.</param>
/// <param name="second">Second value.</param>
/// <returns>true when the difference lies strictly between -2 and 2; otherwise false.</returns>
private static bool isEqual(float first, float second)
{
    float difference = first - second;
    return difference > -2F && difference < 2F;
}
#endregion
How to convert PDF file to text with a specific encoding using C# code?
Below are the steps and C# example source code to convert an existing PDF file to text with Unicode encoding programmatically using C#.
- Create a new Stream object with an existing PDF file loaded
- Utilize DocumentConverter.ToDocument() to convert the PDF in Stream object to a text file in MemoryStream object
- The PDF text converter always uses UTF-8 encoding. Use Encoding.UTF8.GetString() to get the text content
- To get the text as a byte array in Unicode (UTF-16) encoding, use the method Encoding.Unicode.GetBytes()
// Source PDF whose text content is extracted.
String inputFilePath = @"W:\Projects\Test-Files\unicode-test-file.pdf";

using (Stream pdfStream = File.Open(inputFilePath, FileMode.Open, FileAccess.Read))
using (MemoryStream textStream = new MemoryStream())
{
    // Convert the PDF in the file stream to TXT, written into the memory stream.
    DocumentConverter.ToDocument(pdfStream, textStream, FileType.DOC_TXT);

    // Decode the converter's output bytes as UTF-8 text.
    byte[] utf8Bytes = textStream.ToArray();
    String text = Encoding.UTF8.GetString(utf8Bytes);

    // Re-encode the text as Unicode bytes.
    byte[] unicodeBytes = Encoding.Unicode.GetBytes(text);
    // ...
}
C# convert two or more PDF files to text (batch conversion)
#region pdf to text (batch files)
/// <summary>
/// Converts every "*.pdf" file found in C:\input\ to a text file of the same
/// base name (with a ".txt" extension) in C:\output\.
/// </summary>
internal static void convertPdfFilesToText()
{
    String inputDirectory = @"C:\input\";
    String outputDirectory = @"C:\output\";
    String[] files = Directory.GetFiles(inputDirectory, "*.pdf");

    foreach (String filePath in files)
    {
        // Let System.IO.Path derive the base name instead of slicing the
        // string by hand around the last '\' and '.' characters.
        String docName = Path.GetFileNameWithoutExtension(filePath);
        String outputFilePath = Path.Combine(outputDirectory, docName + ".txt");

        // The using block closes each output file even if this document
        // fails to load or convert.
        using (StreamWriter writer = new StreamWriter(outputFilePath))
        {
            PDFDocument doc = new PDFDocument(filePath);
            PDFTextMgr textMgr = PDFTextHandler.ExportPDFTextManager(doc);

            int pageCount = doc.GetPageCount();
            for (int i = 0; i < pageCount; i++)
            {
                PDFPage page = (PDFPage)doc.GetPage(i);
                List<PDFTextLine> pageTextLines = textMgr.ExtractTextLine(page);
                writeTextLines(pageTextLines, writer);
            }
        }
    }
}
#endregion
C# combine multiple PDF files and convert them to text
#region combine pdf files and convert to text
/// <summary>
/// Combines C:\demo1.pdf, C:\demo2.pdf and C:\demo3.pdf into one in-memory
/// PDF and writes the extracted text of every page to an in-memory stream.
/// </summary>
internal static void combineAndConvertToText()
{
    String[] files = new String[] { @"C:\demo1.pdf", @"C:\demo2.pdf", @"C:\demo3.pdf" };

    // using blocks release the streams and the writer even if combining or
    // converting throws.
    using (Stream inputStream = new MemoryStream())
    using (Stream stream = new MemoryStream())
    using (StreamWriter writer = new StreamWriter(stream))
    {
        PDFDocument.CombineDocument(files, inputStream);

        // NOTE(review): presumably the PDFDocument constructor reads the
        // combined stream from its start regardless of the current position;
        // confirm against the library documentation.
        PDFDocument doc = new PDFDocument(inputStream);
        PDFTextMgr textMgr = PDFTextHandler.ExportPDFTextManager(doc);

        int pageCount = doc.GetPageCount();
        for (int i = 0; i < pageCount; i++)
        {
            PDFPage page = (PDFPage)doc.GetPage(i);
            List<PDFTextLine> pageTextLines = textMgr.ExtractTextLine(page);
            writeTextLines(pageTextLines, writer);
        }

        // writeTextLines flushes the writer after each page, so 'stream' holds
        // the converted text at this point. Consume it here: disposing the
        // writer at the end of this block also closes 'stream'.
    }
}
#endregion