# Demonstrates fixed-size chunking with LangChain's CharacterTextSplitter:
# split on a separator, then pack pieces into chunks of ~chunk_size chars
# with chunk_overlap characters shared between consecutive chunks.
from langchain_text_splitters import CharacterTextSplitter

sample_text = (
    "LangChain was created by Harrison Chase in 2022. It provides a framework for developing applications "
    "powered by language models. The library is known for its modularity and ease of use. "
    "One of its key components is the TextSplitter class, which helps in document chunking."
)

text_splitter = CharacterTextSplitter(
    separator=" ",        # split on spaces
    chunk_size=100,       # larger chunk size (in characters)
    chunk_overlap=20,     # overlap between consecutive chunks
    length_function=len,  # measure chunk length in characters
)

docs = text_splitter.create_documents([sample_text])
for i, doc in enumerate(docs):
    print(f"--- Chunk {i+1} ---")
    print(doc.page_content)
import nltk

# Ensure the Punkt sentence-tokenizer model is available.
# NOTE: nltk.data.find() raises LookupError when a resource is missing —
# the original caught nltk.downloader.DownloadError, which is never raised
# here (and no longer exists in modern NLTK), so the guard itself crashed.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

from nltk.tokenize import sent_tokenize


def chunk_by_sentences(text, max_chars=500, overlap_sentences=1):
    """Split *text* into chunks made of whole sentences.

    Sentences are accumulated into a chunk while its length stays within
    *max_chars*. When a chunk is closed, the next chunk restarts with the
    last *overlap_sentences* sentences so consecutive chunks share context.

    Returns a list of stripped chunk strings (empty list for empty input).
    """
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = ""
    for i, sentence in enumerate(sentences):
        if len(current_chunk) + len(sentence) <= max_chars:
            current_chunk += " " + sentence
        else:
            chunks.append(current_chunk.strip())
            # Create overlap: re-seed the next chunk with the trailing
            # sentences plus the one that did not fit.
            start_index = max(0, i - overlap_sentences)
            current_chunk = " ".join(sentences[start_index:i + 1])
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks


long_text = "This is the first sentence. This is the second sentence, which is a bit longer. Now we have a third one. The fourth sentence follows. Finally, the fifth sentence concludes this paragraph."
chunks = chunk_by_sentences(long_text, max_chars=100)
for i, chunk in enumerate(chunks):
    print(f"--- Chunk {i+1} ---")
    print(chunk)
# Demonstrates structure-aware chunking with MarkdownHeaderTextSplitter:
# the document is split at the configured header levels, and each split
# carries its header hierarchy as metadata.
from langchain_text_splitters import MarkdownHeaderTextSplitter

# NOTE: the headers and their body text must sit on separate lines —
# MarkdownHeaderTextSplitter matches headers line by line, so a collapsed
# one-line document would not split at all.
markdown_document = """# Chapter 1: The Beginning

## Section 1.1: The Old World
This is the story of a time long past.

## Section 1.2: A New Hope
A new hero emerges.

# Chapter 2: The Journey

## Section 2.1: The Call to Adventure
The hero receives a mysterious call."""

headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
md_header_splits = markdown_splitter.split_text(markdown_document)

for split in md_header_splits:
    print(f"Metadata: {split.metadata}")
    print(split.page_content)
    print("-" * 20)
2.2.2 对话式分块
• 核心思想：根据对话的发言人或轮次进行分块。
• 适用场景：客服对话、访谈记录、会议纪要。
1 2 3
dialogue = [
    "Alice: Hi, I'm having trouble with my order.",
    "Bot: I can help with that. What's your order number?",
    "Alice: It's 12345.",
    "Alice: I haven't received any shipping updates.",
    "Bot: Let me check... It seems your order was shipped yesterday.",
    "Alice: Oh, great! Thank you.",
]


def chunk_dialogue(dialogue_lines, max_turns_per_chunk=3):
    """Group consecutive dialogue turns into fixed-size chunks.

    Every *max_turns_per_chunk* lines are joined with newlines into one
    chunk; the final chunk may hold fewer turns. Returns a list of chunk
    strings (empty list for empty input).
    """
    starts = range(0, len(dialogue_lines), max_turns_per_chunk)
    return [
        "\n".join(dialogue_lines[start:start + max_turns_per_chunk])
        for start in starts
    ]


chunks = chunk_dialogue(dialogue)
for i, chunk in enumerate(chunks):
    print(f"--- Chunk {i+1} ---")
    print(chunk)
# Demonstrates semantic chunking: sentence embeddings are compared and the
# text is split where the embedding distance exceeds a statistical threshold.
import os

from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings

# Silence the HuggingFace tokenizers fork-parallelism warning.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Create the SemanticChunker instance.
# LangChain's SemanticChunker defaults to the "percentile" threshold type;
# other breakpoint_threshold_type options are "standard_deviation",
# "interquartile", and "gradient".
text_splitter = SemanticChunker(
    embeddings,
    breakpoint_threshold_type="percentile",  # use percentile as the threshold type
    breakpoint_threshold_amount=70,          # split above the 70th percentile
    # (the original comment claimed 80 while the value was 70 — fixed to match)
)

print("SemanticChunker configured.")
print("-" * 50)

long_text = (
    "The Wright brothers, Orville and Wilbur, were two American aviation pioneers "
    "generally credited with inventing, building, and flying the world's first successful motor-operated airplane. "
    "They made the first controlled, sustained flight of a powered, heavier-than-air aircraft on December 17, 1903. "
    "In the following years, they continued to develop their aircraft. "
    "Switching topics completely, let's talk about cooking. "
    "A good pizza starts with a perfect dough, which needs yeast, flour, water, and salt. "
    "The sauce is typically tomato-based, seasoned with herbs like oregano and basil. "
    "Toppings can vary from simple mozzarella to a wide range of meats and vegetables. "
    "Finally, let's consider the solar system. "
    "It is a gravitationally bound system of the Sun and the objects that orbit it. "
    "The largest objects are the eight planets, in order from the Sun: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune."
)

docs = text_splitter.create_documents([long_text])

for i, doc in enumerate(docs):
    print(f"--- Chunk {i+1} ---")
    print(doc.page_content)
    print()